PyPI - docling - Versions diffs - 2.28.0__tar.gz → 2.28.1__tar.gz - Mend

docling 2.28.0.tar.gz → 2.28.1.tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (83)

{docling-2.28.0 → docling-2.28.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.28.0
+Version: 2.28.1
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/docling-project/docling
 License: MIT

{docling-2.28.0 → docling-2.28.1}/docling/backend/msword_backend.py RENAMED Viewed

@@ -53,6 +53,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         self.max_levels: int = 10
         self.level_at_new_list: Optional[int] = None
         self.parents: dict[int, Optional[NodeItem]] = {}
+        self.numbered_headers: dict[int, int] = {}
         for i in range(-1, self.max_levels):
             self.parents[i] = None
@@ -346,7 +347,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 parent=None, label=DocItemLabel.TITLE, text=text
             )
         elif "Heading" in p_style_id:
-            self.add_header(doc, p_level, text)
+            style_element = getattr(paragraph.style, "element", None)
+            if style_element:
+                is_numbered_style = (
+                    "<w:numPr>" in style_element.xml or "<w:numPr>" in element.xml
+                )
+            else:
+                is_numbered_style = False
+            self.add_header(doc, p_level, text, is_numbered_style)
         elif len(equations) > 0:
             if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
@@ -415,7 +423,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         return
     def add_header(
-        self, doc: DoclingDocument, curr_level: Optional[int], text: str
+        self,
+        doc: DoclingDocument,
+        curr_level: Optional[int],
+        text: str,
+        is_numbered_style: bool = False,
     ) -> None:
         level = self.get_level()
         if isinstance(curr_level, int):
@@ -433,17 +445,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                     if key >= curr_level:
                         self.parents[key] = None
-            self.parents[curr_level] = doc.add_heading(
-                parent=self.parents[curr_level - 1],
-                text=text,
-                level=curr_level,
-            )
+            current_level = curr_level
+            parent_level = curr_level - 1
+            add_level = curr_level
         else:
-            self.parents[self.level] = doc.add_heading(
-                parent=self.parents[self.level - 1],
-                text=text,
-                level=1,
-            )
+            current_level = self.level
+            parent_level = self.level - 1
+            add_level = 1
+        if is_numbered_style:
+            if add_level in self.numbered_headers:
+                self.numbered_headers[add_level] += 1
+            else:
+                self.numbered_headers[add_level] = 1
+            text = f"{self.numbered_headers[add_level]} {text}"
+            # Reset deeper levels
+            next_level = add_level + 1
+            while next_level in self.numbered_headers:
+                self.numbered_headers[next_level] = 0
+                next_level += 1
+            # Scan upper levels
+            previous_level = add_level - 1
+            while previous_level in self.numbered_headers:
+                # MSWord convention: no empty sublevels
+                # I.e., sub-sub section (2.0.1) without a sub-section (2.1)
+                # is processed as 2.1.1
+                if self.numbered_headers[previous_level] == 0:
+                    self.numbered_headers[previous_level] += 1
+                text = f"{self.numbered_headers[previous_level]}.{text}"
+                previous_level -= 1
+        self.parents[current_level] = doc.add_heading(
+            parent=self.parents[parent_level],
+            text=text,
+            level=add_level,
+        )
         return
     def add_listitem(

{docling-2.28.0 → docling-2.28.1}/docling/document_converter.py RENAMED Viewed

@@ -1,3 +1,4 @@
+import hashlib
 import logging
 import math
 import sys
@@ -181,7 +182,14 @@ class DocumentConverter:
             )
             for format in self.allowed_formats
         }
-        self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
+        self.initialized_pipelines: Dict[
+            Tuple[Type[BasePipeline], str], BasePipeline
+        ] = {}
+    def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
+        """Generate a hash of pipeline options to use as part of the cache key."""
+        options_str = str(pipeline_options.model_dump())
+        return hashlib.md5(options_str.encode("utf-8")).hexdigest()
     def initialize_pipeline(self, format: InputFormat):
         """Initialize the conversion pipeline for the selected format."""
@@ -279,31 +287,36 @@ class DocumentConverter:
                 yield item
     def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
+        """Retrieve or initialize a pipeline, reusing instances based on class and options."""
         fopt = self.format_to_options.get(doc_format)
-        if fopt is None:
+        if fopt is None or fopt.pipeline_options is None:
             return None
-        else:
-            pipeline_class = fopt.pipeline_cls
-            pipeline_options = fopt.pipeline_options
-        if pipeline_options is None:
-            return None
-        # TODO this will ignore if different options have been defined for the same pipeline class.
-        if (
-            pipeline_class not in self.initialized_pipelines
-            or self.initialized_pipelines[pipeline_class].pipeline_options
-            != pipeline_options
-        ):
-            self.initialized_pipelines[pipeline_class] = pipeline_class(
+        pipeline_class = fopt.pipeline_cls
+        pipeline_options = fopt.pipeline_options
+        options_hash = self._get_pipeline_options_hash(pipeline_options)
+        # Use a composite key to cache pipelines
+        cache_key = (pipeline_class, options_hash)
+        if cache_key not in self.initialized_pipelines:
+            _log.info(
+                f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
+            )
+            self.initialized_pipelines[cache_key] = pipeline_class(
                 pipeline_options=pipeline_options
             )
-        return self.initialized_pipelines[pipeline_class]
+        else:
+            _log.debug(
+                f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
+            )
+        return self.initialized_pipelines[cache_key]
     def _process_document(
         self, in_doc: InputDocument, raises_on_error: bool
     ) -> ConversionResult:
         valid = (
             self.allowed_formats is not None and in_doc.format in self.allowed_formats
         )
@@ -345,7 +358,6 @@ class DocumentConverter:
         else:
             if raises_on_error:
                 raise ConversionError(f"Input document {in_doc.file} is not valid.")
             else:
                 # invalid doc or not of desired format
                 conv_res = ConversionResult(

{docling-2.28.0 → docling-2.28.1}/docling/models/page_preprocessing_model.py RENAMED Viewed

@@ -63,7 +63,13 @@ class PagePreprocessingModel(BasePageModel):
         def draw_text_boxes(image, cells, show: bool = False):
             draw = ImageDraw.Draw(image)
             for c in cells:
-                x0, y0, x1, y1 = c.bbox.as_tuple()
+                x0, y0, x1, y1 = (
+                    c.to_bounding_box().l,
+                    c.to_bounding_box().t,
+                    c.to_bounding_box().r,
+                    c.to_bounding_box().b,
+                )
                 draw.rectangle([(x0, y0), (x1, y1)], outline="red")
             if show:
                 image.show()

{docling-2.28.0 → docling-2.28.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.28.0"  # DO NOT EDIT, updated automatically
+version = "2.28.1"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = [
   "Christoph Auer <cau@zurich.ibm.com>",

{docling-2.28.0 → docling-2.28.1}/LICENSE RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/README.md RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/__init__.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/backend/__init__.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/backend/abstract_backend.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/backend/asciidoc_backend.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/backend/csv_backend.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/backend/docling_parse_backend.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/backend/docling_parse_v2_backend.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/backend/docling_parse_v4_backend.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/backend/docx/__init__.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/backend/docx/latex/__init__.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/backend/docx/latex/latex_dict.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/backend/docx/latex/omml.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/backend/html_backend.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/backend/json/__init__.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/backend/json/docling_json_backend.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/backend/md_backend.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/backend/msexcel_backend.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/backend/mspowerpoint_backend.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/backend/pdf_backend.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/backend/pypdfium2_backend.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/backend/xml/__init__.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/backend/xml/jats_backend.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/backend/xml/uspto_backend.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/chunking/__init__.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/cli/__init__.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/cli/main.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/cli/models.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/cli/tools.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/datamodel/__init__.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/datamodel/base_models.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/datamodel/document.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/datamodel/pipeline_options.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/datamodel/settings.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/exceptions.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/__init__.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/base_model.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/base_ocr_model.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/code_formula_model.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/document_picture_classifier.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/easyocr_model.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/factories/__init__.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/factories/base_factory.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/factories/ocr_factory.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/factories/picture_description_factory.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/hf_mlx_model.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/hf_vlm_model.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/layout_model.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/ocr_mac_model.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/page_assemble_model.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/picture_description_api_model.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/picture_description_base_model.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/picture_description_vlm_model.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/plugins/__init__.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/plugins/defaults.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/rapid_ocr_model.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/readingorder_model.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/table_structure_model.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/tesseract_ocr_cli_model.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/models/tesseract_ocr_model.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/pipeline/__init__.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/pipeline/base_pipeline.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/pipeline/simple_pipeline.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/pipeline/standard_pdf_pipeline.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/pipeline/vlm_pipeline.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/py.typed RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/utils/__init__.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/utils/accelerator_utils.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/utils/export.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/utils/glm_utils.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/utils/layout_postprocessor.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/utils/locks.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/utils/model_downloader.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/utils/ocr_utils.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/utils/profiling.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/utils/utils.py RENAMED Viewed

File without changes

{docling-2.28.0 → docling-2.28.1}/docling/utils/visualization.py RENAMED Viewed

File without changes

docling 2.28.0__tar.gz → 2.28.1__tar.gz

docling 2.28.0.tar.gz → 2.28.1.tar.gz