PyPI - docling - Versions diffs - 2.45.0__tar.gz → 2.46.0__tar.gz - Mend

docling 2.45.0 (tar.gz) → 2.46.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (137) hide show

{docling-2.45.0 → docling-2.46.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.45.0
+Version: 2.46.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
@@ -27,7 +27,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: pydantic<3.0.0,>=2.0.0
 Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
-Requires-Dist: docling-parse<5.0.0,>=4.0.0
+Requires-Dist: docling-parse<5.0.0,>=4.2.2
 Requires-Dist: docling-ibm-models<4,>=3.9.0
 Requires-Dist: filetype<2.0.0,>=1.2.0
 Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0

{docling-2.45.0 → docling-2.46.0}/docling/backend/docling_parse_v4_backend.py RENAMED Viewed

@@ -22,15 +22,52 @@ _log = logging.getLogger(__name__)
 class DoclingParseV4PageBackend(PdfPageBackend):
-    def __init__(self, parsed_page: SegmentedPdfPage, page_obj: PdfPage):
+    def __init__(
+        self,
+        *,
+        dp_doc: PdfDocument,
+        page_obj: PdfPage,
+        page_no: int,
+        create_words: bool = True,
+        create_textlines: bool = True,
+    ):
         self._ppage = page_obj
-        self._dpage = parsed_page
-        self.valid = parsed_page is not None
+        self._dp_doc = dp_doc
+        self._page_no = page_no
+        self._create_words = create_words
+        self._create_textlines = create_textlines
+        self._dpage: Optional[SegmentedPdfPage] = None
+        self._unloaded = False
+        self.valid = (self._ppage is not None) and (self._dp_doc is not None)
+    def _ensure_parsed(self) -> None:
+        if self._dpage is not None:
+            return
+        seg_page = self._dp_doc.get_page(
+            self._page_no + 1,
+            create_words=self._create_words,
+            create_textlines=self._create_textlines,
+        )
+        # In Docling, all TextCell instances are expected with top-left origin.
+        [
+            tc.to_top_left_origin(seg_page.dimension.height)
+            for tc in seg_page.textline_cells
+        ]
+        [tc.to_top_left_origin(seg_page.dimension.height) for tc in seg_page.char_cells]
+        [tc.to_top_left_origin(seg_page.dimension.height) for tc in seg_page.word_cells]
+        self._dpage = seg_page
     def is_valid(self) -> bool:
         return self.valid
     def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        self._ensure_parsed()
+        assert self._dpage is not None
         # Find intersecting cells on the page
         text_piece = ""
         page_size = self.get_size()
@@ -56,12 +93,19 @@ class DoclingParseV4PageBackend(PdfPageBackend):
         return text_piece
     def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        self._ensure_parsed()
         return self._dpage
     def get_text_cells(self) -> Iterable[TextCell]:
+        self._ensure_parsed()
+        assert self._dpage is not None
         return self._dpage.textline_cells
     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
+        self._ensure_parsed()
+        assert self._dpage is not None
         AREA_THRESHOLD = 0  # 32 * 32
         images = self._dpage.bitmap_resources
@@ -123,8 +167,13 @@ class DoclingParseV4PageBackend(PdfPageBackend):
         # )
     def unload(self):
+        if not self._unloaded and self._dp_doc is not None:
+            self._dp_doc.unload_pages((self._page_no + 1, self._page_no + 2))
+            self._unloaded = True
         self._ppage = None
         self._dpage = None
+        self._dp_doc = None
 class DoclingParseV4DocumentBackend(PdfDocumentBackend):
@@ -157,30 +206,15 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
         self, page_no: int, create_words: bool = True, create_textlines: bool = True
     ) -> DoclingParseV4PageBackend:
         with pypdfium2_lock:
-            seg_page = self.dp_doc.get_page(
-                page_no + 1,
-                create_words=create_words,
-                create_textlines=create_textlines,
-            )
-            # In Docling, all TextCell instances are expected with top-left origin.
-            [
-                tc.to_top_left_origin(seg_page.dimension.height)
-                for tc in seg_page.textline_cells
-            ]
-            [
-                tc.to_top_left_origin(seg_page.dimension.height)
-                for tc in seg_page.char_cells
-            ]
-            [
-                tc.to_top_left_origin(seg_page.dimension.height)
-                for tc in seg_page.word_cells
-            ]
-            return DoclingParseV4PageBackend(
-                seg_page,
-                self._pdoc[page_no],
-            )
+            ppage = self._pdoc[page_no]
+        return DoclingParseV4PageBackend(
+            dp_doc=self.dp_doc,
+            page_obj=ppage,
+            page_no=page_no,
+            create_words=create_words,
+            create_textlines=create_textlines,
+        )
     def is_valid(self) -> bool:
         return self.page_count() > 0

{docling-2.45.0 → docling-2.46.0}/docling/backend/html_backend.py RENAMED Viewed

@@ -38,6 +38,7 @@ _BLOCK_TAGS: Final = {
     "address",
     "details",
     "figure",
+    "footer",
     "h1",
     "h2",
     "h3",
@@ -639,10 +640,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                         hyperlink=annotated_text.hyperlink,
                     )
-        elif tag_name == "details":
-            # handle details and its content.
+        elif tag_name in {"details", "footer"}:
+            if tag_name == "footer":
+                current_layer = self.content_layer
+                self.content_layer = ContentLayer.FURNITURE
             self.parents[self.level + 1] = doc.add_group(
-                name="details",
+                name=tag_name,
                 label=GroupLabel.SECTION,
                 parent=self.parents[self.level],
                 content_layer=self.content_layer,
@@ -651,6 +654,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             self._walk(tag, doc)
             self.parents[self.level + 1] = None
             self.level -= 1
+            if tag_name == "footer":
+                self.content_layer = current_layer
     def _emit_image(self, img_tag: Tag, doc: DoclingDocument) -> None:
         figure = img_tag.find_parent("figure")
@@ -686,7 +691,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             text_clean = HTMLDocumentBackend._clean_unicode(
                 caption_anno_text.text.strip()
             )
-            print(caption_anno_text)
             caption_item = doc.add_text(
                 label=DocItemLabel.CAPTION,
                 text=text_clean,

{docling-2.45.0 → docling-2.46.0}/docling/datamodel/pipeline_options.py RENAMED Viewed

@@ -323,9 +323,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
         ),
     )
-    generate_parsed_pages: Literal[True] = (
-        True  # Always True since parsed_page is now mandatory
-    )
+    generate_parsed_pages: bool = False
 class ProcessingPipeline(str, Enum):

{docling-2.45.0 → docling-2.46.0}/docling/models/code_formula_model.py RENAMED Viewed

@@ -1,5 +1,4 @@
 import re
-from collections import Counter
 from collections.abc import Iterable
 from pathlib import Path
 from typing import List, Literal, Optional, Tuple, Union
@@ -13,10 +12,11 @@ from docling_core.types.doc import (
     TextItem,
 )
 from docling_core.types.doc.labels import CodeLanguageLabel
-from PIL import Image, ImageOps
+from PIL import Image
 from pydantic import BaseModel
+from transformers import AutoModelForImageTextToText, AutoProcessor
-from docling.datamodel.accelerator_options import AcceleratorOptions
+from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
 from docling.datamodel.base_models import ItemAndImageEnrichmentElement
 from docling.models.base_model import BaseItemAndImageEnrichmentModel
 from docling.models.utils.hf_model_download import download_hf_model
@@ -65,9 +65,9 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
         Processes the given batch of elements and enriches them with predictions.
     """
-    _model_repo_folder = "ds4sd--CodeFormula"
+    _model_repo_folder = "ds4sd--CodeFormulaV2"
     elements_batch_size = 5
-    images_scale = 1.66  # = 120 dpi, aligned with training data resolution
+    images_scale = 1.67  # = 120 dpi, aligned with training data resolution
     expansion_factor = 0.18
     def __init__(
@@ -95,10 +95,9 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
         self.options = options
         if self.enabled:
-            device = decide_device(accelerator_options.device)
-            from docling_ibm_models.code_formula_model.code_formula_predictor import (
-                CodeFormulaPredictor,
+            self.device = decide_device(
+                accelerator_options.device,
+                supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
             )
             if artifacts_path is None:
@@ -106,11 +105,14 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
             else:
                 artifacts_path = artifacts_path / self._model_repo_folder
-            self.code_formula_model = CodeFormulaPredictor(
-                artifacts_path=str(artifacts_path),
-                device=device,
-                num_threads=accelerator_options.num_threads,
+            self._processor = AutoProcessor.from_pretrained(
+                artifacts_path,
+            )
+            self._model_max_length = self._processor.tokenizer.model_max_length
+            self._model = AutoModelForImageTextToText.from_pretrained(
+                artifacts_path, device_map=self.device
             )
+            self._model.eval()
     @staticmethod
     def download_models(
@@ -119,8 +121,8 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
         progress: bool = False,
     ) -> Path:
         return download_hf_model(
-            repo_id="ds4sd/CodeFormula",
-            revision="v1.0.2",
+            repo_id="ds4sd/CodeFormulaV2",
+            revision="main",
             local_dir=local_dir,
             force=force,
             progress=progress,
@@ -172,7 +174,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
                 - The second element is the extracted language if a match is found;
                 otherwise, `None`.
         """
-        pattern = r"^<_([^_>]+)_>\s(.*)"
+        pattern = r"^<_([^_>]+)_>\s*(.*)"
         match = re.match(pattern, input_string, flags=re.DOTALL)
         if match:
             language = str(match.group(1))  # the captured programming language
@@ -203,81 +205,74 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
         except ValueError:
             return CodeLanguageLabel.UNKNOWN
-    def _get_most_frequent_edge_color(self, pil_img: Image.Image):
+    def _get_prompt(self, label: str) -> str:
         """
-        Compute the most frequent color along the outer edges of a PIL image.
+        Constructs the prompt for the model based on the input label.
         Parameters
         ----------
-            pil_img : Image.Image
-                A PIL Image in any mode (L, RGB, RGBA, etc.).
+        label : str
+            The type of input, either 'code' or 'formula'.
         Returns
         -------
-            (int) or (tuple): The most common edge color as a scalar (for grayscale) or
-                tuple (for RGB/RGBA).
+        str
+            The constructed prompt including necessary tokens and query.
+        Raises
+        ------
+        NotImplementedError
+            If the label is not 'code' or 'formula'.
         """
-        # Convert to NumPy array for easy pixel access
-        img_np = np.array(pil_img)
+        if label == "code":
+            query = "<code>"
+        elif label == "formula":
+            query = "<formula>"
+        else:
+            raise NotImplementedError("Label must be either code or formula")
-        if img_np.ndim == 2:
-            # Grayscale-like image: shape (H, W)
-            # Extract edges: top row, bottom row, left col, right col
-            top = img_np[0, :]  # shape (W,)
-            bottom = img_np[-1, :]  # shape (W,)
-            left = img_np[:, 0]  # shape (H,)
-            right = img_np[:, -1]  # shape (H,)
+        messages = [
+            {
+                "role": "user",
+                "content": [{"type": "image"}, {"type": "text", "text": query}],
+            },
+        ]
-            # Concatenate all edges
-            edges = np.concatenate([top, bottom, left, right])
+        prompt = self._processor.apply_chat_template(
+            messages, add_generation_prompt=True
+        )
-            # Count frequencies
-            freq = Counter(edges.tolist())
-            most_common_value, _ = freq.most_common(1)[0]
-            return int(most_common_value)  # single channel color
+        return prompt
-        else:
-            # Color image: shape (H, W, C)
-            top = img_np[0, :, :]  # shape (W, C)
-            bottom = img_np[-1, :, :]  # shape (W, C)
-            left = img_np[:, 0, :]  # shape (H, C)
-            right = img_np[:, -1, :]  # shape (H, C)
-            # Concatenate edges along first axis
-            edges = np.concatenate([top, bottom, left, right], axis=0)
-            # Convert each color to a tuple for counting
-            edges_as_tuples = [tuple(pixel) for pixel in edges]
-            freq = Counter(edges_as_tuples)
-            most_common_value, _ = freq.most_common(1)[0]
-            return most_common_value  # e.g. (R, G, B) or (R, G, B, A)
-    def _pad_with_most_frequent_edge_color(
-        self, img: Union[Image.Image, np.ndarray], padding: Tuple[int, int, int, int]
-    ):
+    def _post_process(self, texts: list[str]) -> list[str]:
         """
-        Pads an image (PIL or NumPy array) using the most frequent edge color.
+        Processes a list of text strings by truncating at '<end_of_utterance>' and
+        removing a predefined set of unwanted substrings.
         Parameters
         ----------
-            img : Union[Image.Image, np.ndarray]
-                The original image.
-            padding : tuple
-                Padding (left, top, right, bottom) in pixels.
+        texts : list[str]
+            A list of strings to be post-processed.
         Returns
         -------
-            Image.Image: A new PIL image with the specified padding.
+        list[str]
+            A list of cleaned strings with specified substrings removed and truncated at
+                '<end_of_utterance>' if present.
         """
-        if isinstance(img, np.ndarray):
-            pil_img = Image.fromarray(img)
-        else:
-            pil_img = img
+        to_remove = ["</code>", "</formula>", "<loc_0><loc_0><loc_500><loc_500>"]
-        most_freq_color = self._get_most_frequent_edge_color(pil_img)
+        def clean_text(text: str) -> str:
+            idx = text.find("<end_of_utterance>")
+            if idx != -1:
+                text = text[:idx]
-        padded_img = ImageOps.expand(pil_img, border=padding, fill=most_freq_color)
-        return padded_img
+            for token in to_remove:
+                if token in text:
+                    text = text.replace(token, "")
+            return text.lstrip()
+        return [clean_text(t) for t in texts]
     def __call__(
         self,
@@ -308,14 +303,30 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
         images: List[Union[Image.Image, np.ndarray]] = []
         elements: List[TextItem] = []
         for el in element_batch:
-            assert isinstance(el.item, TextItem)
-            elements.append(el.item)
-            labels.append(el.item.label)
-            images.append(
-                self._pad_with_most_frequent_edge_color(el.image, (20, 10, 20, 10))
-            )
+            elements.append(el.item)  # type: ignore[arg-type]
+            labels.append(el.item.label)  # type: ignore[attr-defined]
+            images.append(el.image)
+        prompts = [self._get_prompt(label) for label in labels]
+        inputs = self._processor(
+            text=prompts,
+            images=images,
+            return_tensors="pt",
+        )
+        inputs = inputs.to(self.device)
-        outputs = self.code_formula_model.predict(images, labels)
+        gen_kwargs = dict(
+            max_new_tokens=self._model_max_length - inputs.input_ids.shape[1],
+            use_cache=True,
+            do_sample=False,
+        )
+        generated_ids = self._model.generate(**inputs, **gen_kwargs)
+        outputs = self._processor.batch_decode(
+            generated_ids[:, inputs.input_ids.shape[1] :], skip_special_tokens=False
+        )
+        outputs = self._post_process(outputs)
         for item, output in zip(elements, outputs):
             if isinstance(item, CodeItem):

{docling-2.45.0 → docling-2.46.0}/docling/models/tesseract_ocr_cli_model.py RENAMED Viewed

@@ -320,6 +320,8 @@ class TesseractOcrCliModel(BaseOcrModel):
 def _parse_orientation(df_osd: pd.DataFrame) -> int:
-    orientations = df_osd.loc[df_osd["key"] == "Orientation in degrees"].value.tolist()
-    orientation = parse_tesseract_orientation(orientations[0].strip())
+    # For strictly optimal performance with invariant dataframe format:
+    mask = df_osd["key"].to_numpy() == "Orientation in degrees"
+    orientation_val = df_osd["value"].to_numpy()[mask][0]
+    orientation = parse_tesseract_orientation(orientation_val.strip())
     return orientation

{docling-2.45.0 → docling-2.46.0}/docling/pipeline/base_pipeline.py RENAMED Viewed

@@ -20,7 +20,7 @@ from docling.datamodel.base_models import (
     Page,
 )
 from docling.datamodel.document import ConversionResult, InputDocument
-from docling.datamodel.pipeline_options import PipelineOptions
+from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import GenericEnrichmentModel
 from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -168,6 +168,12 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                         # Cleanup page backends
                         if not self.keep_backend and p._backend is not None:
                             p._backend.unload()
+                        if (
+                            isinstance(self.pipeline_options, PdfPipelineOptions)
+                            and not self.pipeline_options.generate_parsed_pages
+                        ):
+                            del p.parsed_page
+                            p.parsed_page = None
                     end_batch_time = time.monotonic()
                     total_elapsed_time += end_batch_time - start_batch_time

{docling-2.45.0 → docling-2.46.0}/docling/pipeline/threaded_standard_pdf_pipeline.py RENAMED Viewed

@@ -565,10 +565,12 @@ class ThreadedStandardPdfPipeline(BasePipeline):
         if not self.keep_images:
             for p in conv_res.pages:
                 p._image_cache = {}
-        if not self.keep_backend:
-            for p in conv_res.pages:
-                if p._backend is not None:
-                    p._backend.unload()
+        for p in conv_res.pages:
+            if not self.keep_backend and p._backend is not None:
+                p._backend.unload()
+            if not self.pipeline_options.generate_parsed_pages:
+                del p.parsed_page
+                p.parsed_page = None
     # ---------------------------------------------------------------- assemble
     def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:

{docling-2.45.0 → docling-2.46.0}/docling.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.45.0
+Version: 2.46.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
@@ -27,7 +27,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: pydantic<3.0.0,>=2.0.0
 Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
-Requires-Dist: docling-parse<5.0.0,>=4.0.0
+Requires-Dist: docling-parse<5.0.0,>=4.2.2
 Requires-Dist: docling-ibm-models<4,>=3.9.0
 Requires-Dist: filetype<2.0.0,>=1.2.0
 Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0

{docling-2.45.0 → docling-2.46.0}/docling.egg-info/requires.txt RENAMED Viewed

@@ -1,6 +1,6 @@
 pydantic<3.0.0,>=2.0.0
 docling-core[chunking]<3.0.0,>=2.42.0
-docling-parse<5.0.0,>=4.0.0
+docling-parse<5.0.0,>=4.2.2
 docling-ibm-models<4,>=3.9.0
 filetype<2.0.0,>=1.2.0
 pypdfium2!=4.30.1,<5.0.0,>=4.30.0

{docling-2.45.0 → docling-2.46.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "docling"
-version = "2.45.0"  # DO NOT EDIT, updated automatically
+version = "2.46.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 license = "MIT"
 keywords = [
@@ -45,7 +45,7 @@ requires-python = '>=3.9,<4.0'
 dependencies = [
   'pydantic (>=2.0.0,<3.0.0)',
   'docling-core[chunking] (>=2.42.0,<3.0.0)',
-  'docling-parse (>=4.0.0,<5.0.0)',
+  'docling-parse (>=4.2.2,<5.0.0)',
   "docling-ibm-models>=3.9.0,<4",
   'filetype (>=1.2.0,<2.0.0)',
   'pypdfium2 (>=4.30.0,!=4.30.1,<5.0.0)',

{docling-2.45.0 → docling-2.46.0}/tests/test_backend_html.py RENAMED Viewed

@@ -1,6 +1,8 @@
 from io import BytesIO
 from pathlib import Path
+from docling_core.types.doc.document import ContentLayer
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import (
@@ -179,3 +181,33 @@ def test_e2e_html_conversions():
         )
         assert verify_document(doc, str(gt_path) + ".json", GENERATE)
+def test_html_furniture():
+    raw_html = (
+        b"<html><body><p>Initial content with some <strong>bold text</strong></p>"
+        b"<h1>Main Heading</h1>"
+        b"<p>Some Content</p>"
+        b"<footer><p>Some Footer Content</p></footer></body></html"
+    )
+    in_doc = InputDocument(
+        path_or_stream=BytesIO(raw_html),
+        format=InputFormat.HTML,
+        backend=HTMLDocumentBackend,
+        filename="test",
+    )
+    backend = HTMLDocumentBackend(
+        in_doc=in_doc,
+        path_or_stream=BytesIO(raw_html),
+    )
+    doc: DoclingDocument = backend.convert()
+    md_body = doc.export_to_markdown()
+    assert md_body == "# Main Heading\n\nSome Content"
+    md_all = doc.export_to_markdown(
+        included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE}
+    )
+    assert md_all == (
+        "Initial content with some bold text\n\n# Main Heading\n\nSome Content\n\n"
+        "Some Footer Content"
+    )

{docling-2.45.0 → docling-2.46.0}/tests/test_e2e_conversion.py RENAMED Viewed

@@ -27,6 +27,7 @@ def get_converter():
     pipeline_options.do_table_structure = True
     pipeline_options.table_structure_options.do_cell_matching = True
     pipeline_options.accelerator_options.device = AcceleratorDevice.CPU
+    pipeline_options.generate_parsed_pages = True
     converter = DocumentConverter(
         format_options={

{docling-2.45.0 → docling-2.46.0}/tests/test_interfaces.py RENAMED Viewed

@@ -3,6 +3,7 @@ from pathlib import Path
 import pytest
+from docling.datamodel.accelerator_options import AcceleratorDevice
 from docling.datamodel.base_models import DocumentStream, InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -24,6 +25,8 @@ def converter():
     pipeline_options.do_ocr = False
     pipeline_options.do_table_structure = True
     pipeline_options.table_structure_options.do_cell_matching = True
+    pipeline_options.accelerator_options.device = AcceleratorDevice.CPU
+    pipeline_options.generate_parsed_pages = True
     converter = DocumentConverter(
         format_options={

{docling-2.45.0 → docling-2.46.0}/LICENSE RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/README.md RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/__init__.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/__init__.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/abstract_backend.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/asciidoc_backend.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/csv_backend.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/docling_parse_backend.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/docling_parse_v2_backend.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/docx/__init__.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/docx/latex/__init__.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/docx/latex/latex_dict.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/docx/latex/omml.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/json/__init__.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/json/docling_json_backend.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/md_backend.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/mets_gbs_backend.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/msexcel_backend.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/mspowerpoint_backend.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/msword_backend.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/noop_backend.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/pdf_backend.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/pypdfium2_backend.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/xml/__init__.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/xml/jats_backend.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/backend/xml/uspto_backend.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/chunking/__init__.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/cli/__init__.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/cli/main.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/cli/models.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/cli/tools.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/datamodel/__init__.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/datamodel/accelerator_options.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/datamodel/asr_model_specs.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/datamodel/base_models.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/datamodel/document.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/datamodel/layout_model_specs.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/datamodel/pipeline_options_asr_model.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/datamodel/pipeline_options_vlm_model.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/datamodel/settings.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/datamodel/vlm_model_specs.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/document_converter.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/exceptions.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/__init__.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/api_vlm_model.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/base_model.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/base_ocr_model.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/document_picture_classifier.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/easyocr_model.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/factories/__init__.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/factories/base_factory.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/factories/ocr_factory.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/factories/picture_description_factory.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/layout_model.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/ocr_mac_model.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/page_assemble_model.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/page_preprocessing_model.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/picture_description_api_model.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/picture_description_base_model.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/picture_description_vlm_model.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/plugins/__init__.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/plugins/defaults.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/rapid_ocr_model.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/readingorder_model.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/table_structure_model.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/tesseract_ocr_model.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/utils/__init__.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/utils/hf_model_download.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/vlm_models_inline/__init__.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/vlm_models_inline/hf_transformers_model.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/models/vlm_models_inline/mlx_model.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/pipeline/__init__.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/pipeline/asr_pipeline.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/pipeline/simple_pipeline.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/pipeline/standard_pdf_pipeline.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/pipeline/vlm_pipeline.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/py.typed RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/utils/__init__.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/utils/accelerator_utils.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/utils/api_image_request.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/utils/export.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/utils/glm_utils.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/utils/layout_postprocessor.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/utils/locks.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/utils/model_downloader.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/utils/ocr_utils.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/utils/orientation.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/utils/profiling.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/utils/utils.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling/utils/visualization.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling.egg-info/SOURCES.txt RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling.egg-info/entry_points.txt RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/docling.egg-info/top_level.txt RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/setup.cfg RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_asr_pipeline.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_backend_asciidoc.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_backend_csv.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_backend_docling_json.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_backend_docling_parse.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_backend_docling_parse_v2.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_backend_docling_parse_v4.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_backend_jats.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_backend_markdown.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_backend_mets_gbs.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_backend_msexcel.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_backend_msword.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_backend_patent_uspto.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_backend_pdfium.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_backend_pptx.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_backend_webp.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_cli.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_code_formula.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_data_gen_flag.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_document_picture_classifier.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_e2e_ocr_conversion.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_input_doc.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_invalid_input.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_legacy_format_transform.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_ocr_utils.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_options.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_settings_load.py RENAMED Viewed

File without changes

{docling-2.45.0 → docling-2.46.0}/tests/test_threaded_pipeline.py RENAMED Viewed

File without changes

docling 2.45.0__tar.gz → 2.46.0__tar.gz

docling 2.45.0__tar.gz → 2.46.0__tar.gz