PyPI - docling - Versions diffs - 2.34.0__tar.gz → 2.35.0__tar.gz - Mend

docling 2.34.0__tar.gz → 2.35.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (86) hide show

{docling-2.34.0 → docling-2.35.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.34.0
+Version: 2.35.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/docling-project/docling
 License: MIT
@@ -29,7 +29,7 @@ Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platfo
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: click (<8.2.0)
-Requires-Dist: docling-core[chunking] (>=2.29.0,<3.0.0)
+Requires-Dist: docling-core[chunking] (>=2.31.2,<3.0.0)
 Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
 Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)

{docling-2.34.0 → docling-2.35.0}/docling/cli/main.py RENAMED Viewed

@@ -12,6 +12,12 @@ from typing import Annotated, Dict, List, Optional, Type
 import rich.table
 import typer
+from docling_core.transforms.serializer.html import (
+    HTMLDocSerializer,
+    HTMLOutputStyle,
+    HTMLParams,
+)
+from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
 from docling_core.types.doc import ImageRefMode
 from docling_core.utils.file import resolve_source_to_path
 from pydantic import TypeAdapter
@@ -156,6 +162,7 @@ def export_documents(
     export_json: bool,
     export_html: bool,
     export_html_split_page: bool,
+    show_layout: bool,
     export_md: bool,
     export_txt: bool,
     export_doctags: bool,
@@ -189,9 +196,27 @@ def export_documents(
             if export_html_split_page:
                 fname = output_dir / f"{doc_filename}.html"
                 _log.info(f"writing HTML output to {fname}")
-                conv_res.document.save_as_html(
-                    filename=fname, image_mode=image_export_mode, split_page_view=True
-                )
+                if show_layout:
+                    ser = HTMLDocSerializer(
+                        doc=conv_res.document,
+                        params=HTMLParams(
+                            image_mode=image_export_mode,
+                            output_style=HTMLOutputStyle.SPLIT_PAGE,
+                        ),
+                    )
+                    visualizer = LayoutVisualizer()
+                    visualizer.params.show_label = False
+                    ser_res = ser.serialize(
+                        visualizer=visualizer,
+                    )
+                    with open(fname, "w") as fw:
+                        fw.write(ser_res.text)
+                else:
+                    conv_res.document.save_as_html(
+                        filename=fname,
+                        image_mode=image_export_mode,
+                        split_page_view=True,
+                    )
             # Export Text format:
             if export_txt:
@@ -250,6 +275,13 @@ def convert(  # noqa: C901
     to_formats: List[OutputFormat] = typer.Option(
         None, "--to", help="Specify output formats. Defaults to Markdown."
     ),
+    show_layout: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            help="If enabled, the page images will show the bounding-boxes of the items.",
+        ),
+    ] = False,
     headers: str = typer.Option(
         None,
         "--headers",
@@ -596,6 +628,7 @@ def convert(  # noqa: C901
             export_json=export_json,
             export_html=export_html,
             export_html_split_page=export_html_split_page,
+            show_layout=show_layout,
             export_md=export_md,
             export_txt=export_txt,
             export_doctags=export_doctags,

{docling-2.34.0 → docling-2.35.0}/docling/datamodel/document.py RENAMED Viewed

@@ -334,9 +334,9 @@ class _DocumentConversionInput(BaseModel):
     ) -> Optional[InputFormat]:
         """Guess the input format of a document by checking part of its content."""
         input_format: Optional[InputFormat] = None
-        content_str = content.decode("utf-8")
         if mime == "application/xml":
+            content_str = content.decode("utf-8")
             match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
             if match_doctype:
                 xml_doctype = match_doctype.group()
@@ -358,6 +358,7 @@ class _DocumentConversionInput(BaseModel):
                     input_format = InputFormat.XML_JATS
         elif mime == "text/plain":
+            content_str = content.decode("utf-8")
             if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
                 input_format = InputFormat.XML_USPTO
@@ -411,7 +412,11 @@ class _DocumentConversionInput(BaseModel):
             else:
                 return "application/xml"
-        if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
+        if re.match(
+            r"(<script.*?>.*?</script>\s*)?(<!doctype\s+html|<html|<head|<body)",
+            content_str,
+            re.DOTALL,
+        ):
             return "text/html"
         p = re.compile(

{docling-2.34.0 → docling-2.35.0}/docling/models/layout_model.py RENAMED Viewed

@@ -185,13 +185,23 @@ class LayoutModel(BasePageModel):
                     ).postprocess()
                     # processed_clusters, processed_cells = clusters, page.cells
-                    conv_res.confidence.pages[page.page_no].layout_score = float(
-                        np.mean([c.confidence for c in processed_clusters])
-                    )
+                    with warnings.catch_warnings():
+                        warnings.filterwarnings(
+                            "ignore",
+                            "Mean of empty slice|invalid value encountered in scalar divide",
+                            RuntimeWarning,
+                            "numpy",
+                        )
-                    conv_res.confidence.pages[page.page_no].ocr_score = float(
-                        np.mean([c.confidence for c in processed_cells if c.from_ocr])
-                    )
+                        conv_res.confidence.pages[page.page_no].layout_score = float(
+                            np.mean([c.confidence for c in processed_clusters])
+                        )
+                        conv_res.confidence.pages[page.page_no].ocr_score = float(
+                            np.mean(
+                                [c.confidence for c in processed_cells if c.from_ocr]
+                            )
+                        )
                     page.cells = processed_cells
                     page.predictions.layout = LayoutPrediction(

{docling-2.34.0 → docling-2.35.0}/docling/models/page_preprocessing_model.py RENAMED Viewed

@@ -1,4 +1,5 @@
 import re
+import warnings
 from collections.abc import Iterable
 from pathlib import Path
 from typing import Optional
@@ -7,7 +8,7 @@ import numpy as np
 from PIL import ImageDraw
 from pydantic import BaseModel
-from docling.datamodel.base_models import Page, ScoreValue
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
@@ -76,11 +77,15 @@ class PagePreprocessingModel(BasePageModel):
             score = self.rate_text_quality(c.text)
             text_scores.append(score)
-        conv_res.confidence.pages[page.page_no].parse_score = float(
-            np.nanquantile(
-                text_scores, q=0.10
-            )  # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
-        )
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore", "Mean of empty slice", RuntimeWarning, "numpy"
+            )
+            conv_res.confidence.pages[page.page_no].parse_score = float(
+                np.nanquantile(
+                    text_scores, q=0.10
+                )  # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
+            )
         # DEBUG code:
         def draw_text_boxes(image, cells, show: bool = False):

{docling-2.34.0 → docling-2.35.0}/docling/pipeline/standard_pdf_pipeline.py RENAMED Viewed

@@ -8,7 +8,7 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
-from docling.datamodel.base_models import AssembledUnit, Page, PageConfidenceScores
+from docling.datamodel.base_models import AssembledUnit, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.datamodel.settings import settings
@@ -55,11 +55,13 @@ class StandardPdfPipeline(PaginatedPipeline):
                 "When defined, it must point to a folder containing all models required by the pipeline."
             )
-        self.keep_images = (
-            self.pipeline_options.generate_page_images
-            or self.pipeline_options.generate_picture_images
-            or self.pipeline_options.generate_table_images
-        )
+        with warnings.catch_warnings():  # deprecated generate_table_images
+            warnings.filterwarnings("ignore", category=DeprecationWarning)
+            self.keep_images = (
+                self.pipeline_options.generate_page_images
+                or self.pipeline_options.generate_picture_images
+                or self.pipeline_options.generate_table_images
+            )
         self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
@@ -210,64 +212,74 @@ class StandardPdfPipeline(PaginatedPipeline):
                     )
             # Generate images of the requested element types
-            if (
-                self.pipeline_options.generate_picture_images
-                or self.pipeline_options.generate_table_images
-            ):
-                scale = self.pipeline_options.images_scale
-                for element, _level in conv_res.document.iterate_items():
-                    if not isinstance(element, DocItem) or len(element.prov) == 0:
-                        continue
-                    if (
-                        isinstance(element, PictureItem)
-                        and self.pipeline_options.generate_picture_images
-                    ) or (
-                        isinstance(element, TableItem)
-                        and self.pipeline_options.generate_table_images
-                    ):
-                        page_ix = element.prov[0].page_no - 1
-                        page = next(
-                            (p for p in conv_res.pages if p.page_no == page_ix),
-                            cast("Page", None),
-                        )
-                        assert page is not None
-                        assert page.size is not None
-                        assert page.image is not None
-                        crop_bbox = (
-                            element.prov[0]
-                            .bbox.scaled(scale=scale)
-                            .to_top_left_origin(page_height=page.size.height * scale)
-                        )
-                        cropped_im = page.image.crop(crop_bbox.as_tuple())
-                        element.image = ImageRef.from_pil(
-                            cropped_im, dpi=int(72 * scale)
-                        )
+            with warnings.catch_warnings():  # deprecated generate_table_images
+                warnings.filterwarnings("ignore", category=DeprecationWarning)
+                if (
+                    self.pipeline_options.generate_picture_images
+                    or self.pipeline_options.generate_table_images
+                ):
+                    scale = self.pipeline_options.images_scale
+                    for element, _level in conv_res.document.iterate_items():
+                        if not isinstance(element, DocItem) or len(element.prov) == 0:
+                            continue
+                        if (
+                            isinstance(element, PictureItem)
+                            and self.pipeline_options.generate_picture_images
+                        ) or (
+                            isinstance(element, TableItem)
+                            and self.pipeline_options.generate_table_images
+                        ):
+                            page_ix = element.prov[0].page_no - 1
+                            page = next(
+                                (p for p in conv_res.pages if p.page_no == page_ix),
+                                cast("Page", None),
+                            )
+                            assert page is not None
+                            assert page.size is not None
+                            assert page.image is not None
+                            crop_bbox = (
+                                element.prov[0]
+                                .bbox.scaled(scale=scale)
+                                .to_top_left_origin(
+                                    page_height=page.size.height * scale
+                                )
+                            )
+                            cropped_im = page.image.crop(crop_bbox.as_tuple())
+                            element.image = ImageRef.from_pil(
+                                cropped_im, dpi=int(72 * scale)
+                            )
             # Aggregate confidence values for document:
             if len(conv_res.pages) > 0:
-                conv_res.confidence.layout_score = float(
-                    np.nanmean(
-                        [c.layout_score for c in conv_res.confidence.pages.values()]
+                with warnings.catch_warnings():
+                    warnings.filterwarnings(
+                        "ignore",
+                        category=RuntimeWarning,
+                        message="Mean of empty slice|All-NaN slice encountered",
                     )
-                )
-                conv_res.confidence.parse_score = float(
-                    np.nanquantile(
-                        [c.parse_score for c in conv_res.confidence.pages.values()],
-                        q=0.1,  # parse score should relate to worst 10% of pages.
+                    conv_res.confidence.layout_score = float(
+                        np.nanmean(
+                            [c.layout_score for c in conv_res.confidence.pages.values()]
+                        )
                     )
-                )
-                conv_res.confidence.table_score = float(
-                    np.nanmean(
-                        [c.table_score for c in conv_res.confidence.pages.values()]
+                    conv_res.confidence.parse_score = float(
+                        np.nanquantile(
+                            [c.parse_score for c in conv_res.confidence.pages.values()],
+                            q=0.1,  # parse score should relate to worst 10% of pages.
+                        )
                     )
-                )
-                conv_res.confidence.ocr_score = float(
-                    np.nanmean(
-                        [c.ocr_score for c in conv_res.confidence.pages.values()]
+                    conv_res.confidence.table_score = float(
+                        np.nanmean(
+                            [c.table_score for c in conv_res.confidence.pages.values()]
+                        )
+                    )
+                    conv_res.confidence.ocr_score = float(
+                        np.nanmean(
+                            [c.ocr_score for c in conv_res.confidence.pages.values()]
+                        )
                     )
-                )
         return conv_res

{docling-2.34.0 → docling-2.35.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.34.0"  # DO NOT EDIT, updated automatically
+version = "2.35.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = [
   "Christoph Auer <cau@zurich.ibm.com>",
@@ -46,7 +46,7 @@ packages = [{ include = "docling" }]
 ######################
 python = "^3.9"
 pydantic = "^2.0.0"
-docling-core = {version = "^2.29.0", extras = ["chunking"]}
+docling-core = {version = "^2.31.2", extras = ["chunking"]}
 docling-ibm-models = "^3.4.0"
 docling-parse = "^4.0.0"
 filetype = "^1.2.0"

{docling-2.34.0 → docling-2.35.0}/LICENSE RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/README.md RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/__init__.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/__init__.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/abstract_backend.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/asciidoc_backend.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/csv_backend.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/docling_parse_backend.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/docling_parse_v2_backend.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/docling_parse_v4_backend.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/docx/__init__.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/docx/latex/__init__.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/docx/latex/latex_dict.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/docx/latex/omml.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/html_backend.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/json/__init__.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/json/docling_json_backend.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/md_backend.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/msexcel_backend.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/mspowerpoint_backend.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/msword_backend.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/pdf_backend.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/pypdfium2_backend.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/xml/__init__.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/xml/jats_backend.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/backend/xml/uspto_backend.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/chunking/__init__.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/cli/__init__.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/cli/models.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/cli/tools.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/datamodel/__init__.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/datamodel/base_models.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/datamodel/pipeline_options.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/datamodel/settings.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/document_converter.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/exceptions.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/__init__.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/api_vlm_model.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/base_model.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/base_ocr_model.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/code_formula_model.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/document_picture_classifier.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/easyocr_model.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/factories/__init__.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/factories/base_factory.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/factories/ocr_factory.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/factories/picture_description_factory.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/hf_mlx_model.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/hf_vlm_model.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/ocr_mac_model.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/page_assemble_model.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/picture_description_api_model.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/picture_description_base_model.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/picture_description_vlm_model.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/plugins/__init__.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/plugins/defaults.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/rapid_ocr_model.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/readingorder_model.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/table_structure_model.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/tesseract_ocr_cli_model.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/models/tesseract_ocr_model.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/pipeline/__init__.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/pipeline/base_pipeline.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/pipeline/simple_pipeline.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/pipeline/vlm_pipeline.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/py.typed RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/utils/__init__.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/utils/accelerator_utils.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/utils/api_image_request.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/utils/export.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/utils/glm_utils.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/utils/layout_postprocessor.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/utils/locks.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/utils/model_downloader.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/utils/ocr_utils.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/utils/orientation.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/utils/profiling.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/utils/utils.py RENAMED Viewed

File without changes

{docling-2.34.0 → docling-2.35.0}/docling/utils/visualization.py RENAMED Viewed

File without changes

docling 2.34.0__tar.gz → 2.35.0__tar.gz

docling 2.34.0__tar.gz → 2.35.0__tar.gz