docling 2.23.1__py3-none-any.whl → 2.25.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in the public registry.
@@ -1,9 +1,10 @@
 import logging
 from io import BytesIO
 from pathlib import Path
-from typing import Optional, Union, cast
+from typing import Final, Optional, Union, cast
 
 from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
+from bs4.element import PreformattedString
 from docling_core.types.doc import (
     DocItem,
     DocItemLabel,
@@ -22,12 +23,29 @@ from docling.datamodel.document import InputDocument
 
 _log = logging.getLogger(__name__)
 
+# tags that generate NodeItem elements
+TAGS_FOR_NODE_ITEMS: Final = [
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6",
+    "p",
+    "pre",
+    "ul",
+    "ol",
+    "li",
+    "table",
+    "figure",
+    "img",
+]
+
 
 class HTMLDocumentBackend(DeclarativeDocumentBackend):
     @override
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
-        _log.debug("About to init HTML backend...")
         self.soup: Optional[Tag] = None
         # HTML file:
         self.path_or_stream = path_or_stream
@@ -88,6 +106,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         assert self.soup is not None
         content = self.soup.body or self.soup
         # Replace <br> tags with newline characters
+        # TODO: remove style to avoid losing text from tags like i, b, span, ...
         for br in content("br"):
             br.replace_with(NavigableString("\n"))
         self.walk(content, doc)
@@ -99,6 +118,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
 
     def walk(self, tag: Tag, doc: DoclingDocument) -> None:
         # Iterate over elements in the body of the document
+        text: str = ""
         for element in tag.children:
             if isinstance(element, Tag):
                 try:
@@ -108,6 +128,25 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                         f"Error processing child from tag{tag.name}: {exc_child}"
                     )
                     raise exc_child
+            elif isinstance(element, NavigableString) and not isinstance(
+                element, PreformattedString
+            ):
+                # Floating text outside paragraphs or analyzed tags
+                text += element
+                siblings: list[Tag] = [
+                    item for item in element.next_siblings if isinstance(item, Tag)
+                ]
+                if element.next_sibling is None or any(
+                    [item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
+                ):
+                    text = text.strip()
+                    if text and tag.name in ["div"]:
+                        doc.add_text(
+                            parent=self.parents[self.level],
+                            label=DocItemLabel.PARAGRAPH,
+                            text=text,
+                        )
+                    text = ""
 
         return
 
@@ -158,7 +197,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         text = element.text.strip()
 
         if hlevel == 1:
-            for key, val in self.parents.items():
+            for key in self.parents.keys():
                 self.parents[key] = None
 
             self.level = 1
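
The new `elif` branch in `walk` collects "floating" text, i.e. strings that sit directly inside a container such as a `<div>` rather than inside a paragraph-level tag, and flushes them as a paragraph item once the container ends or a structural sibling from `TAGS_FOR_NODE_ITEMS` follows. A minimal sketch of which nodes that branch sees, using only `bs4` (the HTML string is illustrative, not taken from the package):

    from bs4 import BeautifulSoup, NavigableString

    soup = BeautifulSoup("<div>intro text<p>a paragraph</p>tail text</div>", "html.parser")
    for child in soup.div.children:
        if isinstance(child, NavigableString):
            # "intro text" and "tail text" are the floating strings that the
            # new branch accumulates and emits as PARAGRAPH items
            print(repr(child))
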
docling/cli/models.py CHANGED
@@ -32,9 +32,19 @@ class _AvailableModels(str, Enum):
     CODE_FORMULA = "code_formula"
     PICTURE_CLASSIFIER = "picture_classifier"
     SMOLVLM = "smolvlm"
+    GRANITE_VISION = "granite_vision"
     EASYOCR = "easyocr"
 
 
+_default_models = [
+    _AvailableModels.LAYOUT,
+    _AvailableModels.TABLEFORMER,
+    _AvailableModels.CODE_FORMULA,
+    _AvailableModels.PICTURE_CLASSIFIER,
+    _AvailableModels.EASYOCR,
+]
+
+
 @app.command("download")
 def download(
     output_dir: Annotated[
@@ -43,18 +53,27 @@ def download(
             ...,
             "-o",
             "--output-dir",
-            help="The directory where all the models are downloaded.",
+            help="The directory where to download the models.",
         ),
     ] = (settings.cache_dir / "models"),
     force: Annotated[
-        bool, typer.Option(..., help="If true, the download will be forced")
+        bool, typer.Option(..., help="If true, the download will be forced.")
     ] = False,
     models: Annotated[
         Optional[list[_AvailableModels]],
         typer.Argument(
-            help=f"Models to download (default behavior: all will be downloaded)",
+            help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
         ),
     ] = None,
+    all: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            "--all",
+            help="If true, all available models will be downloaded (mutually exclusive with passing specific models).",
+            show_default=True,
+        ),
+    ] = False,
     quiet: Annotated[
         bool,
         typer.Option(
@@ -65,6 +84,10 @@ def download(
         ),
     ] = False,
 ):
+    if models and all:
+        raise typer.BadParameter(
+            "Cannot simultaneously set 'all' parameter and specify models to download."
+        )
     if not quiet:
         FORMAT = "%(message)s"
         logging.basicConfig(
@@ -73,7 +96,7 @@ def download(
             datefmt="[%X]",
             handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
         )
-    to_download = models or [m for m in _AvailableModels]
+    to_download = models or ([m for m in _AvailableModels] if all else _default_models)
     output_dir = download_models(
         output_dir=output_dir,
         force=force,
@@ -83,6 +106,7 @@ def download(
         with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
         with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
         with_smolvlm=_AvailableModels.SMOLVLM in to_download,
+        with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
         with_easyocr=_AvailableModels.EASYOCR in to_download,
     )
 
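Taken together, the CLI now downloads only the curated `_default_models` set when no models are named, `--all` restores the previous download-everything behavior, and combining `--all` with explicit model names fails fast with `BadParameter`. Assuming the standard `docling-tools` entry point, usage looks like:

    docling-tools models download                    # default model set only
    docling-tools models download --all              # every model, incl. granite_vision
    docling-tools models download layout easyocr     # explicit subset (not valid with --all)
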
@@ -154,6 +154,10 @@ class LayoutPrediction(BaseModel):
     clusters: List[Cluster] = []
 
 
+class VlmPrediction(BaseModel):
+    text: str = ""
+
+
 class ContainerElement(
     BasePageElement
 ):  # Used for Form and Key-Value-Regions, only for typing.
@@ -197,6 +201,7 @@ class PagePredictions(BaseModel):
     tablestructure: Optional[TableStructurePrediction] = None
     figures_classification: Optional[FigureClassificationPrediction] = None
     equations_prediction: Optional[EquationPrediction] = None
+    vlm_response: Optional[VlmPrediction] = None
 
 
 PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
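
`vlm_response` is the field the new VLM model populates with raw generated text. A minimal sketch of reading it back after conversion (assuming `page` is a `Page` produced by a VLM-enabled pipeline run):

    if page.predictions.vlm_response is not None:
        raw_output = page.predictions.vlm_response.text  # doctags or markdown, per response_format
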
@@ -41,6 +41,7 @@ class AcceleratorOptions(BaseSettings):
 
     num_threads: int = 4
     device: Union[str, AcceleratorDevice] = "auto"
+    cuda_use_flash_attention2: bool = False
 
     @field_validator("device")
     def validate_device(cls, value):
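
The new flag is opt-in and, as the model-loading code later in this diff shows, it is only honored when the resolved device is CUDA. A sketch of enabling it (field names as in the diff; flash-attn must be installed separately):

    from docling.datamodel.pipeline_options import AcceleratorOptions

    accelerator_options = AcceleratorOptions(
        device="cuda",
        cuda_use_flash_attention2=True,  # assumes the flash-attn package is available
    )
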
@@ -254,6 +255,45 @@ granite_picture_description = PictureDescriptionVlmOptions(
 )
 
 
+class BaseVlmOptions(BaseModel):
+    kind: str
+    prompt: str
+
+
+class ResponseFormat(str, Enum):
+    DOCTAGS = "doctags"
+    MARKDOWN = "markdown"
+
+
+class HuggingFaceVlmOptions(BaseVlmOptions):
+    kind: Literal["hf_model_options"] = "hf_model_options"
+
+    repo_id: str
+    load_in_8bit: bool = True
+    llm_int8_threshold: float = 6.0
+    quantized: bool = False
+
+    response_format: ResponseFormat
+
+    @property
+    def repo_cache_folder(self) -> str:
+        return self.repo_id.replace("/", "--")
+
+
+smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
+    repo_id="ds4sd/SmolDocling-256M-preview",
+    prompt="Convert this page to docling.",
+    response_format=ResponseFormat.DOCTAGS,
+)
+
+granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
+    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
+    # prompt="OCR the full page to markdown.",
+    prompt="OCR this image.",
+    response_format=ResponseFormat.MARKDOWN,
+)
+
+
 # Define an enum for the backend options
 class PdfBackend(str, Enum):
     """Enum of valid PDF backends."""
@@ -285,7 +325,24 @@ class PipelineOptions(BaseModel):
     enable_remote_services: bool = False
 
 
-class PdfPipelineOptions(PipelineOptions):
+class PaginatedPipelineOptions(PipelineOptions):
+    images_scale: float = 1.0
+    generate_page_images: bool = False
+    generate_picture_images: bool = False
+
+
+class VlmPipelineOptions(PaginatedPipelineOptions):
+    artifacts_path: Optional[Union[Path, str]] = None
+
+    generate_page_images: bool = True
+    force_backend_text: bool = (
+        False  # (To be used with vlms, or other generative models)
+    )
+    # If True, text from backend will be used instead of generated text
+    vlm_options: Union[HuggingFaceVlmOptions] = smoldocling_vlm_conversion_options
+
+
+class PdfPipelineOptions(PaginatedPipelineOptions):
     """Options for the PDF pipeline."""
 
     artifacts_path: Optional[Union[Path, str]] = None
@@ -295,6 +352,10 @@ class PdfPipelineOptions(PipelineOptions):
     do_formula_enrichment: bool = False  # True: perform formula OCR, return Latex code
     do_picture_classification: bool = False  # True: classify pictures in documents
     do_picture_description: bool = False  # True: run describe pictures in documents
+    force_backend_text: bool = (
+        False  # (To be used with vlms, or other generative models)
+    )
+    # If True, text from backend will be used instead of generated text
 
     table_structure_options: TableStructureOptions = TableStructureOptions()
     ocr_options: Union[
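
With the refactoring above, page-image settings now live in `PaginatedPipelineOptions`, which both the PDF and the new VLM pipeline options inherit. A sketch of configuring the VLM variant with the Granite Vision preset (wiring the options into a converter goes through pipeline classes outside this diff):

    from docling.datamodel.pipeline_options import (
        VlmPipelineOptions,
        granite_vision_vlm_conversion_options,
    )

    pipeline_options = VlmPipelineOptions(
        vlm_options=granite_vision_vlm_conversion_options,
        force_backend_text=False,  # keep the model-generated text
    )
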
@@ -0,0 +1,180 @@
+import logging
+import time
+from pathlib import Path
+from typing import Iterable, List, Optional
+
+from docling.datamodel.base_models import Page, VlmPrediction
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
+    AcceleratorOptions,
+    HuggingFaceVlmOptions,
+)
+from docling.datamodel.settings import settings
+from docling.models.base_model import BasePageModel
+from docling.utils.accelerator_utils import decide_device
+from docling.utils.profiling import TimeRecorder
+
+_log = logging.getLogger(__name__)
+
+
+class HuggingFaceVlmModel(BasePageModel):
+
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        accelerator_options: AcceleratorOptions,
+        vlm_options: HuggingFaceVlmOptions,
+    ):
+        self.enabled = enabled
+
+        self.vlm_options = vlm_options
+
+        if self.enabled:
+            import torch
+            from transformers import (  # type: ignore
+                AutoModelForVision2Seq,
+                AutoProcessor,
+                BitsAndBytesConfig,
+            )
+
+            device = decide_device(accelerator_options.device)
+            self.device = device
+
+            _log.debug("Available device for HuggingFace VLM: {}".format(device))
+
+            repo_cache_folder = vlm_options.repo_id.replace("/", "--")
+
+            # PARAMETERS:
+            if artifacts_path is None:
+                artifacts_path = self.download_models(self.vlm_options.repo_id)
+            elif (artifacts_path / repo_cache_folder).exists():
+                artifacts_path = artifacts_path / repo_cache_folder
+
+            self.param_question = vlm_options.prompt  # "Perform Layout Analysis."
+            self.param_quantization_config = BitsAndBytesConfig(
+                load_in_8bit=vlm_options.load_in_8bit,  # True,
+                llm_int8_threshold=vlm_options.llm_int8_threshold,  # 6.0
+            )
+            self.param_quantized = vlm_options.quantized  # False
+
+            self.processor = AutoProcessor.from_pretrained(artifacts_path)
+            if not self.param_quantized:
+                self.vlm_model = AutoModelForVision2Seq.from_pretrained(
+                    artifacts_path,
+                    device_map=device,
+                    torch_dtype=torch.bfloat16,
+                    _attn_implementation=(
+                        "flash_attention_2"
+                        if self.device.startswith("cuda")
+                        and accelerator_options.cuda_use_flash_attention2
+                        else "eager"
+                    ),
+                )  # .to(self.device)
+
+            else:
+                self.vlm_model = AutoModelForVision2Seq.from_pretrained(
+                    artifacts_path,
+                    device_map=device,
+                    torch_dtype="auto",
+                    quantization_config=self.param_quantization_config,
+                    _attn_implementation=(
+                        "flash_attention_2"
+                        if self.device.startswith("cuda")
+                        and accelerator_options.cuda_use_flash_attention2
+                        else "eager"
+                    ),
+                )  # .to(self.device)
+
+    @staticmethod
+    def download_models(
+        repo_id: str,
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
+    ) -> Path:
+        from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
+
+        if not progress:
+            disable_progress_bars()
+        download_path = snapshot_download(
+            repo_id=repo_id,
+            force_download=force,
+            local_dir=local_dir,
+            # revision="v0.0.1",
+        )
+
+        return Path(download_path)
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "vlm"):
+                    assert page.size is not None
+
+                    hi_res_image = page.get_image(scale=2.0)  # 144dpi
+                    # hi_res_image = page.get_image(scale=1.0)  # 72dpi
+
+                    if hi_res_image is not None:
+                        im_width, im_height = hi_res_image.size
+
+                    # populate page_tags with predicted doc tags
+                    page_tags = ""
+
+                    if hi_res_image:
+                        if hi_res_image.mode != "RGB":
+                            hi_res_image = hi_res_image.convert("RGB")
+
+                    messages = [
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": "This is a page from a document.",
+                                },
+                                {"type": "image"},
+                                {"type": "text", "text": self.param_question},
+                            ],
+                        }
+                    ]
+                    prompt = self.processor.apply_chat_template(
+                        messages, add_generation_prompt=False
+                    )
+                    inputs = self.processor(
+                        text=prompt, images=[hi_res_image], return_tensors="pt"
+                    )
+                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+                    start_time = time.time()
+                    # Call model to generate:
+                    generated_ids = self.vlm_model.generate(
+                        **inputs, max_new_tokens=4096, use_cache=True
+                    )
+
+                    generation_time = time.time() - start_time
+                    generated_texts = self.processor.batch_decode(
+                        generated_ids[:, inputs["input_ids"].shape[1] :],
+                        skip_special_tokens=False,
+                    )[0]
+
+                    num_tokens = len(generated_ids[0])
+                    page_tags = generated_texts
+
+                    # inference_time = time.time() - start_time
+                    # tokens_per_second = num_tokens / generation_time
+                    # print("")
+                    # print(f"Page Inference Time: {inference_time:.2f} seconds")
+                    # print(f"Total tokens on page: {num_tokens:.2f}")
+                    # print(f"Tokens/sec: {tokens_per_second:.2f}")
+                    # print("")
+                    page.predictions.vlm_response = VlmPrediction(text=page_tags)
+
+                yield page
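
A sketch of driving the new model class directly, using only the constructor and `__call__` signatures shown above (the module path is assumed from the class name; `conv_res` and `pages` would come from a conversion run):

    from docling.datamodel.pipeline_options import (
        AcceleratorOptions,
        smoldocling_vlm_conversion_options,
    )
    from docling.models.hf_vlm_model import HuggingFaceVlmModel  # module path assumed

    vlm_model = HuggingFaceVlmModel(
        enabled=True,
        artifacts_path=None,  # None triggers a fresh snapshot_download of the repo
        accelerator_options=AcceleratorOptions(),
        vlm_options=smoldocling_vlm_conversion_options,
    )
    # conv_res: ConversionResult, pages: Iterable[Page] -- assumed inputs
    for page in vlm_model(conv_res, pages):
        if page.predictions.vlm_response is not None:
            print(page.predictions.vlm_response.text)
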
@@ -52,6 +52,14 @@ class PageAssembleModel(BasePageModel):
 
         sanitized_text = "".join(lines)
 
+        # Text normalization
+        sanitized_text = sanitized_text.replace("⁄", "/")
+        sanitized_text = sanitized_text.replace("’", "'")
+        sanitized_text = sanitized_text.replace("‘", "'")
+        sanitized_text = sanitized_text.replace("“", '"')
+        sanitized_text = sanitized_text.replace("”", '"')
+        sanitized_text = sanitized_text.replace("•", "·")
+
         return sanitized_text.strip()  # Strip any leading or trailing whitespace
 
     def __call__(
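
The six chained `str.replace` calls map typographic glyphs to ASCII lookalikes. An equivalent single-pass formulation, shown here only as a design note (not the shipped code), would use a translation table:

    # one-pass equivalent of the replacements above (sketch)
    _GLYPH_MAP = str.maketrans({"⁄": "/", "’": "'", "‘": "'", "“": '"', "”": '"', "•": "·"})
    sanitized_text = sanitized_text.translate(_GLYPH_MAP)
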
@@ -41,9 +41,9 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
         )
 
         # Initialize processor and model
-        self.processor = AutoProcessor.from_pretrained(self.options.repo_id)
+        self.processor = AutoProcessor.from_pretrained(artifacts_path)
         self.model = AutoModelForVision2Seq.from_pretrained(
-            self.options.repo_id,
+            artifacts_path,
             torch_dtype=torch.bfloat16,
             _attn_implementation=(
                 "flash_attention_2" if self.device.startswith("cuda") else "eager"