PyPI - docling - Versions diffs - 2.15.1__py3-none-any.whl → 2.17.0__py3-none-any.whl - Mend

docling-2.15.1-py3-none-any.whl → docling-2.17.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

docling/backend/abstract_backend.py +0 -1
docling/backend/asciidoc_backend.py +0 -1
docling/backend/docling_parse_backend.py +1 -1
docling/backend/docling_parse_v2_backend.py +1 -1
docling/backend/html_backend.py +4 -3
docling/backend/json/__init__.py +0 -0
docling/backend/json/docling_json_backend.py +58 -0
docling/backend/md_backend.py +49 -36
docling/backend/msexcel_backend.py +50 -38
docling/backend/msword_backend.py +0 -1
docling/backend/pdf_backend.py +0 -2
docling/backend/pypdfium2_backend.py +1 -1
docling/backend/xml/uspto_backend.py +25 -25
docling/cli/main.py +18 -3
docling/datamodel/base_models.py +30 -3
docling/datamodel/document.py +4 -0
docling/datamodel/pipeline_options.py +7 -9
docling/document_converter.py +4 -0
docling/models/base_model.py +62 -6
docling/models/code_formula_model.py +245 -0
docling/models/document_picture_classifier.py +187 -0
docling/models/layout_model.py +10 -86
docling/models/page_assemble_model.py +1 -33
docling/models/rapid_ocr_model.py +1 -0
docling/models/tesseract_ocr_cli_model.py +72 -5
docling/models/tesseract_ocr_model.py +68 -20
docling/pipeline/base_pipeline.py +40 -17
docling/pipeline/standard_pdf_pipeline.py +31 -2
docling/utils/glm_utils.py +4 -1
docling/utils/ocr_utils.py +9 -0
docling/utils/visualization.py +80 -0
{docling-2.15.1.dist-info → docling-2.17.0.dist-info}/METADATA +17 -13
docling-2.17.0.dist-info/RECORD +62 -0
docling-2.15.1.dist-info/RECORD +0 -56
{docling-2.15.1.dist-info → docling-2.17.0.dist-info}/LICENSE +0 -0
{docling-2.15.1.dist-info → docling-2.17.0.dist-info}/WHEEL +0 -0
{docling-2.15.1.dist-info → docling-2.17.0.dist-info}/entry_points.txt +0 -0

docling/backend/xml/uspto_backend.py CHANGED Viewed

@@ -389,7 +389,7 @@ class PatentUsptoIce(PatentUspto):
             if name == self.Element.TITLE.value:
                 if text:
                     self.parents[self.level + 1] = self.doc.add_title(
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                         text=text,
                     )
                     self.level += 1
@@ -406,7 +406,7 @@ class PatentUsptoIce(PatentUspto):
                     abstract_item = self.doc.add_heading(
                         heading_text,
                         level=heading_level,
-                        parent=self.parents[heading_level],  # type: ignore[arg-type]
+                        parent=self.parents[heading_level],
                     )
                     self.doc.add_text(
                         label=DocItemLabel.PARAGRAPH,
@@ -434,7 +434,7 @@ class PatentUsptoIce(PatentUspto):
                 claims_item = self.doc.add_heading(
                     heading_text,
                     level=heading_level,
-                    parent=self.parents[heading_level],  # type: ignore[arg-type]
+                    parent=self.parents[heading_level],
                 )
                 for text in self.claims:
                     self.doc.add_text(
@@ -452,7 +452,7 @@ class PatentUsptoIce(PatentUspto):
                     self.doc.add_text(
                         label=DocItemLabel.PARAGRAPH,
                         text=text,
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                     )
                 self.text = ""
@@ -460,7 +460,7 @@ class PatentUsptoIce(PatentUspto):
                 self.parents[self.level + 1] = self.doc.add_heading(
                     text=text,
                     level=self.level,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
                 self.level += 1
                 self.text = ""
@@ -470,7 +470,7 @@ class PatentUsptoIce(PatentUspto):
                 empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
                 self.doc.add_table(
                     data=empty_table,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
         def _apply_style(self, text: str, style_tag: str) -> str:
@@ -721,7 +721,7 @@ class PatentUsptoGrantV2(PatentUspto):
                 if self.Element.TITLE.value in self.property and text.strip():
                     title = text.strip()
                     self.parents[self.level + 1] = self.doc.add_title(
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                         text=title,
                     )
                     self.level += 1
@@ -749,7 +749,7 @@ class PatentUsptoGrantV2(PatentUspto):
                     self.parents[self.level + 1] = self.doc.add_heading(
                         text=text.strip(),
                         level=self.level,
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                     )
                     self.level += 1
@@ -769,7 +769,7 @@ class PatentUsptoGrantV2(PatentUspto):
                 claims_item = self.doc.add_heading(
                     heading_text,
                     level=heading_level,
-                    parent=self.parents[heading_level],  # type: ignore[arg-type]
+                    parent=self.parents[heading_level],
                 )
                 for text in self.claims:
                     self.doc.add_text(
@@ -787,7 +787,7 @@ class PatentUsptoGrantV2(PatentUspto):
                 abstract_item = self.doc.add_heading(
                     heading_text,
                     level=heading_level,
-                    parent=self.parents[heading_level],  # type: ignore[arg-type]
+                    parent=self.parents[heading_level],
                 )
                 self.doc.add_text(
                     label=DocItemLabel.PARAGRAPH, text=abstract, parent=abstract_item
@@ -799,7 +799,7 @@ class PatentUsptoGrantV2(PatentUspto):
                     self.doc.add_text(
                         label=DocItemLabel.PARAGRAPH,
                         text=paragraph,
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                     )
                 elif self.Element.CLAIM.value in self.property:
                     # we may need a space after a paragraph in claim text
@@ -811,7 +811,7 @@ class PatentUsptoGrantV2(PatentUspto):
                 empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
                 self.doc.add_table(
                     data=empty_table,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
         def _apply_style(self, text: str, style_tag: str) -> str:
@@ -938,7 +938,7 @@ class PatentUsptoGrantAps(PatentUspto):
         self.parents[self.level + 1] = self.doc.add_heading(
             heading.value,
             level=self.level,
-            parent=self.parents[self.level],  # type: ignore[arg-type]
+            parent=self.parents[self.level],
         )
         self.level += 1
@@ -959,7 +959,7 @@ class PatentUsptoGrantAps(PatentUspto):
         if field == self.Field.TITLE.value:
             self.parents[self.level + 1] = self.doc.add_title(
-                parent=self.parents[self.level], text=value  # type: ignore[arg-type]
+                parent=self.parents[self.level], text=value
             )
             self.level += 1
@@ -971,14 +971,14 @@ class PatentUsptoGrantAps(PatentUspto):
                 self.doc.add_text(
                     label=DocItemLabel.PARAGRAPH,
                     text=value,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
         elif field == self.Field.NUMBER.value and section == self.Section.CLAIMS.value:
             self.doc.add_text(
                 label=DocItemLabel.PARAGRAPH,
                 text="",
-                parent=self.parents[self.level],  # type: ignore[arg-type]
+                parent=self.parents[self.level],
             )
         elif (
@@ -996,7 +996,7 @@ class PatentUsptoGrantAps(PatentUspto):
                 last_claim = self.doc.add_text(
                     label=DocItemLabel.PARAGRAPH,
                     text="",
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
             last_claim.text += f" {value}" if last_claim.text else value
@@ -1012,7 +1012,7 @@ class PatentUsptoGrantAps(PatentUspto):
             self.parents[self.level + 1] = self.doc.add_heading(
                 value,
                 level=self.level,
-                parent=self.parents[self.level],  # type: ignore[arg-type]
+                parent=self.parents[self.level],
             )
             self.level += 1
@@ -1029,7 +1029,7 @@ class PatentUsptoGrantAps(PatentUspto):
             self.doc.add_text(
                 label=DocItemLabel.PARAGRAPH,
                 text=value,
-                parent=self.parents[self.level],  # type: ignore[arg-type]
+                parent=self.parents[self.level],
             )
     def parse(self, patent_content: str) -> Optional[DoclingDocument]:
@@ -1283,7 +1283,7 @@ class PatentUsptoAppV1(PatentUspto):
                 title = text.strip()
                 if title:
                     self.parents[self.level + 1] = self.doc.add_text(
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                         label=DocItemLabel.TITLE,
                         text=title,
                     )
@@ -1301,7 +1301,7 @@ class PatentUsptoAppV1(PatentUspto):
                     abstract_item = self.doc.add_heading(
                         heading_text,
                         level=heading_level,
-                        parent=self.parents[heading_level],  # type: ignore[arg-type]
+                        parent=self.parents[heading_level],
                     )
                     self.doc.add_text(
                         label=DocItemLabel.PARAGRAPH,
@@ -1331,7 +1331,7 @@ class PatentUsptoAppV1(PatentUspto):
                 claims_item = self.doc.add_heading(
                     heading_text,
                     level=heading_level,
-                    parent=self.parents[heading_level],  # type: ignore[arg-type]
+                    parent=self.parents[heading_level],
                 )
                 for text in self.claims:
                     self.doc.add_text(
@@ -1350,14 +1350,14 @@ class PatentUsptoAppV1(PatentUspto):
                         self.parents[self.level + 1] = self.doc.add_heading(
                             text=text,
                             level=self.level,
-                            parent=self.parents[self.level],  # type: ignore[arg-type]
+                            parent=self.parents[self.level],
                         )
                         self.level += 1
                     else:
                         self.doc.add_text(
                             label=DocItemLabel.PARAGRAPH,
                             text=text,
-                            parent=self.parents[self.level],  # type: ignore[arg-type]
+                            parent=self.parents[self.level],
                         )
                 self.text = ""
@@ -1366,7 +1366,7 @@ class PatentUsptoAppV1(PatentUspto):
                 empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
                 self.doc.add_table(
                     data=empty_table,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
         def _apply_style(self, text: str, style_tag: str) -> str:

docling/cli/main.py CHANGED Viewed

@@ -1,18 +1,18 @@
 import importlib
-import json
 import logging
+import platform
 import re
+import sys
 import tempfile
 import time
 import warnings
-from enum import Enum
 from pathlib import Path
 from typing import Annotated, Dict, Iterable, List, Optional, Type
 import typer
 from docling_core.types.doc import ImageRefMode
 from docling_core.utils.file import resolve_source_to_path
-from pydantic import TypeAdapter, ValidationError
+from pydantic import TypeAdapter
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -65,10 +65,15 @@ def version_callback(value: bool):
         docling_core_version = importlib.metadata.version("docling-core")
         docling_ibm_models_version = importlib.metadata.version("docling-ibm-models")
         docling_parse_version = importlib.metadata.version("docling-parse")
+        platform_str = platform.platform()
+        py_impl_version = sys.implementation.cache_tag
+        py_lang_version = platform.python_version()
         print(f"Docling version: {docling_version}")
         print(f"Docling Core version: {docling_core_version}")
         print(f"Docling IBM Models version: {docling_ibm_models_version}")
         print(f"Docling Parse version: {docling_parse_version}")
+        print(f"Python: {py_impl_version} ({py_lang_version})")
+        print(f"Platform: {platform_str}")
         raise typer.Exit()
@@ -206,6 +211,14 @@ def convert(
         TableFormerMode,
         typer.Option(..., help="The mode to use in the table structure model."),
     ] = TableFormerMode.FAST,
+    enrich_code: Annotated[
+        bool,
+        typer.Option(..., help="Enable the code enrichment model in the pipeline."),
+    ] = False,
+    enrich_formula: Annotated[
+        bool,
+        typer.Option(..., help="Enable the formula enrichment model in the pipeline."),
+    ] = False,
     artifacts_path: Annotated[
         Optional[Path],
         typer.Option(..., help="If provided, the location of the model artifacts."),
@@ -360,6 +373,8 @@ def convert(
             do_ocr=ocr,
             ocr_options=ocr_options,
             do_table_structure=True,
+            do_code_enrichment=enrich_code,
+            do_formula_enrichment=enrich_formula,
             document_timeout=document_timeout,
         )
         pipeline_options.table_structure_options.do_cell_matching = (

docling/datamodel/base_models.py CHANGED Viewed

@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Union
 from docling_core.types.doc import (
     BoundingBox,
     DocItemLabel,
+    NodeItem,
     PictureDataType,
     Size,
     TableCell,
@@ -40,6 +41,7 @@ class InputFormat(str, Enum):
     MD = "md"
     XLSX = "xlsx"
     XML_USPTO = "xml_uspto"
+    JSON_DOCLING = "json_docling"
 class OutputFormat(str, Enum):
@@ -61,6 +63,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
     InputFormat.XLSX: ["xlsx"],
     InputFormat.XML_USPTO: ["xml", "txt"],
+    InputFormat.JSON_DOCLING: ["json"],
 }
 FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -89,6 +92,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
     ],
     InputFormat.XML_USPTO: ["application/xml", "text/plain"],
+    InputFormat.JSON_DOCLING: ["application/json"],
 }
 MimeTypeToFormat: dict[str, list[InputFormat]] = {
@@ -201,6 +205,13 @@ class AssembledUnit(BaseModel):
     headers: List[PageElement] = []
+class ItemAndImageEnrichmentElement(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    item: NodeItem
+    image: Image
 class Page(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
@@ -219,12 +230,28 @@ class Page(BaseModel):
         {}
     )  # Cache of images in different scales. By default it is cleared during assembling.
-    def get_image(self, scale: float = 1.0) -> Optional[Image]:
+    def get_image(
+        self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
+    ) -> Optional[Image]:
         if self._backend is None:
             return self._image_cache.get(scale, None)
         if not scale in self._image_cache:
-            self._image_cache[scale] = self._backend.get_page_image(scale=scale)
-        return self._image_cache[scale]
+            if cropbox is None:
+                self._image_cache[scale] = self._backend.get_page_image(scale=scale)
+            else:
+                return self._backend.get_page_image(scale=scale, cropbox=cropbox)
+        if cropbox is None:
+            return self._image_cache[scale]
+        else:
+            page_im = self._image_cache[scale]
+            assert self.size is not None
+            return page_im.crop(
+                cropbox.to_top_left_origin(page_height=self.size.height)
+                .scaled(scale=scale)
+                .as_tuple()
+            )
     @property
     def image(self) -> Optional[Image]:

docling/datamodel/document.py CHANGED Viewed

@@ -350,6 +350,10 @@ class _DocumentConversionInput(BaseModel):
             mime = FormatToMimeType[InputFormat.HTML][0]
         elif ext in FormatToExtensions[InputFormat.MD]:
             mime = FormatToMimeType[InputFormat.MD][0]
+        elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
+            mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
+        elif ext in FormatToExtensions[InputFormat.PDF]:
+            mime = FormatToMimeType[InputFormat.PDF][0]
         return mime
     @staticmethod

docling/datamodel/pipeline_options.py CHANGED Viewed

@@ -1,17 +1,11 @@
 import logging
 import os
-import warnings
 from enum import Enum
 from pathlib import Path
-from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union
+from typing import Any, List, Literal, Optional, Union
-from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
-from pydantic_settings import (
-    BaseSettings,
-    PydanticBaseSettingsSource,
-    SettingsConfigDict,
-)
-from typing_extensions import deprecated
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict
 _log = logging.getLogger(__name__)
@@ -125,6 +119,7 @@ class RapidOcrOptions(OcrOptions):
     det_model_path: Optional[str] = None  # same default as rapidocr
     cls_model_path: Optional[str] = None  # same default as rapidocr
     rec_model_path: Optional[str] = None  # same default as rapidocr
+    rec_keys_path: Optional[str] = None  # same default as rapidocr
     model_config = ConfigDict(
         extra="forbid",
@@ -225,6 +220,9 @@ class PdfPipelineOptions(PipelineOptions):
     artifacts_path: Optional[Union[Path, str]] = None
     do_table_structure: bool = True  # True: perform table structure extraction
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
+    do_code_enrichment: bool = False  # True: perform code OCR
+    do_formula_enrichment: bool = False  # True: perform formula OCR, return Latex code
+    do_picture_classification: bool = False  # True: classify pictures in documents
     table_structure_options: TableStructureOptions = TableStructureOptions()
     ocr_options: Union[

docling/document_converter.py CHANGED Viewed

@@ -11,6 +11,7 @@ from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.asciidoc_backend import AsciiDocBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
+from docling.backend.json.docling_json_backend import DoclingJSONBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
@@ -136,6 +137,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
         InputFormat.PDF: FormatOption(
             pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
         ),
+        InputFormat.JSON_DOCLING: FormatOption(
+            pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
+        ),
     }
     if (options := format_to_default_options.get(format)) is not None:
         return options

docling/models/base_model.py CHANGED Viewed

@@ -1,9 +1,10 @@
 from abc import ABC, abstractmethod
-from typing import Any, Iterable
+from typing import Any, Generic, Iterable, Optional
-from docling_core.types.doc import DoclingDocument, NodeItem
+from docling_core.types.doc import BoundingBox, DoclingDocument, NodeItem, TextItem
+from typing_extensions import TypeVar
-from docling.datamodel.base_models import Page
+from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
 from docling.datamodel.document import ConversionResult
@@ -15,14 +16,69 @@ class BasePageModel(ABC):
         pass
-class BaseEnrichmentModel(ABC):
+EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
+class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
     @abstractmethod
     def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
         pass
+    @abstractmethod
+    def prepare_element(
+        self, conv_res: ConversionResult, element: NodeItem
+    ) -> Optional[EnrichElementT]:
+        pass
     @abstractmethod
     def __call__(
-        self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
-    ) -> Iterable[Any]:
+        self, doc: DoclingDocument, element_batch: Iterable[EnrichElementT]
+    ) -> Iterable[NodeItem]:
         pass
+class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
+    def prepare_element(
+        self, conv_res: ConversionResult, element: NodeItem
+    ) -> Optional[NodeItem]:
+        if self.is_processable(doc=conv_res.document, element=element):
+            return element
+        return None
+class BaseItemAndImageEnrichmentModel(
+    GenericEnrichmentModel[ItemAndImageEnrichmentElement]
+):
+    images_scale: float
+    expansion_factor: float = 0.0
+    def prepare_element(
+        self, conv_res: ConversionResult, element: NodeItem
+    ) -> Optional[ItemAndImageEnrichmentElement]:
+        if not self.is_processable(doc=conv_res.document, element=element):
+            return None
+        assert isinstance(element, TextItem)
+        element_prov = element.prov[0]
+        bbox = element_prov.bbox
+        width = bbox.r - bbox.l
+        height = bbox.t - bbox.b
+        # TODO: move to a utility in the BoundingBox class
+        expanded_bbox = BoundingBox(
+            l=bbox.l - width * self.expansion_factor,
+            t=bbox.t + height * self.expansion_factor,
+            r=bbox.r + width * self.expansion_factor,
+            b=bbox.b - height * self.expansion_factor,
+            coord_origin=bbox.coord_origin,
+        )
+        page_ix = element_prov.page_no - 1
+        cropped_image = conv_res.pages[page_ix].get_image(
+            scale=self.images_scale, cropbox=expanded_bbox
+        )
+        return ItemAndImageEnrichmentElement(item=element, image=cropped_image)

docling 2.15.1__py3-none-any.whl → 2.17.0__py3-none-any.whl

docling-2.15.1-py3-none-any.whl → docling-2.17.0-py3-none-any.whl