PyPI - docling - Versions diffs - 2.30.0__py3-none-any.whl → 2.31.0__py3-none-any.whl - Mend

docling 2.30.0__py3-none-any.whl → 2.31.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59)

docling/backend/asciidoc_backend.py +7 -15
docling/backend/csv_backend.py +1 -1
docling/backend/docling_parse_backend.py +2 -2
docling/backend/docling_parse_v2_backend.py +2 -2
docling/backend/docling_parse_v4_backend.py +3 -4
docling/backend/docx/latex/latex_dict.py +0 -5
docling/backend/docx/latex/omml.py +4 -7
docling/backend/html_backend.py +26 -9
docling/backend/md_backend.py +5 -7
docling/backend/msexcel_backend.py +1 -7
docling/backend/mspowerpoint_backend.py +4 -7
docling/backend/msword_backend.py +4 -4
docling/backend/pdf_backend.py +2 -1
docling/backend/pypdfium2_backend.py +3 -3
docling/backend/xml/jats_backend.py +10 -13
docling/backend/xml/uspto_backend.py +15 -19
docling/cli/main.py +7 -7
docling/cli/models.py +2 -3
docling/datamodel/base_models.py +7 -5
docling/datamodel/document.py +11 -10
docling/datamodel/pipeline_options.py +0 -1
docling/document_converter.py +5 -5
docling/models/api_vlm_model.py +1 -2
docling/models/base_model.py +2 -4
docling/models/base_ocr_model.py +2 -2
docling/models/code_formula_model.py +2 -1
docling/models/document_picture_classifier.py +2 -1
docling/models/easyocr_model.py +10 -11
docling/models/factories/__init__.py +2 -2
docling/models/factories/base_factory.py +1 -1
docling/models/hf_mlx_model.py +4 -6
docling/models/hf_vlm_model.py +7 -5
docling/models/layout_model.py +2 -2
docling/models/ocr_mac_model.py +3 -4
docling/models/page_assemble_model.py +7 -12
docling/models/page_preprocessing_model.py +2 -1
docling/models/picture_description_api_model.py +2 -1
docling/models/picture_description_base_model.py +2 -3
docling/models/picture_description_vlm_model.py +2 -3
docling/models/rapid_ocr_model.py +2 -3
docling/models/readingorder_model.py +8 -23
docling/models/table_structure_model.py +2 -6
docling/models/tesseract_ocr_cli_model.py +17 -16
docling/models/tesseract_ocr_model.py +8 -6
docling/pipeline/base_pipeline.py +4 -8
docling/pipeline/simple_pipeline.py +0 -1
docling/pipeline/standard_pdf_pipeline.py +0 -1
docling/pipeline/vlm_pipeline.py +0 -3
docling/utils/export.py +2 -4
docling/utils/glm_utils.py +2 -2
docling/utils/layout_postprocessor.py +4 -2
docling/utils/model_downloader.py +7 -7
docling/utils/utils.py +1 -1
{docling-2.30.0.dist-info → docling-2.31.0.dist-info}/METADATA +2 -1
docling-2.31.0.dist-info/RECORD +86 -0
docling-2.30.0.dist-info/RECORD +0 -86
{docling-2.30.0.dist-info → docling-2.31.0.dist-info}/LICENSE +0 -0
{docling-2.30.0.dist-info → docling-2.31.0.dist-info}/WHEEL +0 -0
{docling-2.30.0.dist-info → docling-2.31.0.dist-info}/entry_points.txt +0 -0

docling/models/hf_mlx_model.py CHANGED Viewed

@@ -1,25 +1,22 @@
 import logging
 import time
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Optional
+from typing import Optional
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
     AcceleratorOptions,
     HuggingFaceVlmOptions,
 )
-from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
-from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
 class HuggingFaceMlxModel(BasePageModel):
     def __init__(
         self,
         enabled: bool,
@@ -32,7 +29,6 @@ class HuggingFaceMlxModel(BasePageModel):
         self.vlm_options = vlm_options
         if self.enabled:
             try:
                 from mlx_vlm import generate, load  # type: ignore
                 from mlx_vlm.prompt_utils import apply_chat_template  # type: ignore
@@ -125,6 +121,8 @@ class HuggingFaceMlxModel(BasePageModel):
                     generation_time = time.time() - start_time
                     page_tags = output
+                    _log.debug(f"Generation time {generation_time:.2f} seconds.")
                     # inference_time = time.time() - start_time
                     # tokens_per_second = num_tokens / generation_time
                     # print("")

docling/models/hf_vlm_model.py CHANGED Viewed

@@ -1,16 +1,15 @@
 import logging
 import time
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Optional
+from typing import Optional
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
     AcceleratorOptions,
     HuggingFaceVlmOptions,
 )
-from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
@@ -19,7 +18,6 @@ _log = logging.getLogger(__name__)
 class HuggingFaceVlmModel(BasePageModel):
     def __init__(
         self,
         enabled: bool,
@@ -42,7 +40,7 @@ class HuggingFaceVlmModel(BasePageModel):
             device = decide_device(accelerator_options.device)
             self.device = device
-            _log.debug("Available device for HuggingFace VLM: {}".format(device))
+            _log.debug(f"Available device for HuggingFace VLM: {device}")
             repo_cache_folder = vlm_options.repo_id.replace("/", "--")
@@ -168,6 +166,10 @@ class HuggingFaceVlmModel(BasePageModel):
                     num_tokens = len(generated_ids[0])
                     page_tags = generated_texts
+                    _log.debug(
+                        f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
+                    )
                     # inference_time = time.time() - start_time
                     # tokens_per_second = num_tokens / generation_time
                     # print("")

docling/models/layout_model.py CHANGED Viewed

@@ -1,8 +1,9 @@
 import copy
 import logging
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Union
+from typing import Optional
 from docling_core.types.doc import DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
@@ -142,7 +143,6 @@ class LayoutModel(BasePageModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
         for page in page_batch:
             assert page._backend is not None
             if not page._backend.is_valid():

docling/models/ocr_mac_model.py CHANGED Viewed

@@ -1,8 +1,9 @@
 import logging
 import sys
 import tempfile
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Tuple, Type
+from typing import Optional, Type
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell
@@ -41,7 +42,7 @@ class OcrMacModel(BaseOcrModel):
         if self.enabled:
             if "darwin" != sys.platform:
-                raise RuntimeError(f"OcrMac is only supported on Mac.")
+                raise RuntimeError("OcrMac is only supported on Mac.")
             install_errmsg = (
                 "ocrmac is not correctly installed. "
                 "Please install it via `pip install ocrmac` to use this OCR engine. "
@@ -58,7 +59,6 @@ class OcrMacModel(BaseOcrModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
         if not self.enabled:
             yield from page_batch
             return
@@ -69,7 +69,6 @@ class OcrMacModel(BaseOcrModel):
                 yield page
             else:
                 with TimeRecorder(conv_res, "ocr"):
                     ocr_rects = self.get_ocr_rects(page)
                     all_ocr_cells = []

docling/models/page_assemble_model.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import logging
 import re
-from typing import Iterable, List
+from collections.abc import Iterable
+from typing import List
 from pydantic import BaseModel
@@ -53,9 +54,9 @@ class PageAssembleModel(BasePageModel):
         sanitized_text = "".join(lines)
         # Text normalization
-        sanitized_text = sanitized_text.replace("⁄", "/")
-        sanitized_text = sanitized_text.replace("’", "'")
-        sanitized_text = sanitized_text.replace("‘", "'")
+        sanitized_text = sanitized_text.replace("⁄", "/")  # noqa: RUF001
+        sanitized_text = sanitized_text.replace("’", "'")  # noqa: RUF001
+        sanitized_text = sanitized_text.replace("‘", "'")  # noqa: RUF001
         sanitized_text = sanitized_text.replace("“", '"')
         sanitized_text = sanitized_text.replace("”", '"')
         sanitized_text = sanitized_text.replace("•", "·")
@@ -71,7 +72,6 @@ class PageAssembleModel(BasePageModel):
                 yield page
             else:
                 with TimeRecorder(conv_res, "page_assemble"):
                     assert page.predictions.layout is not None
                     # assembles some JSON output page by page.
@@ -83,7 +83,6 @@ class PageAssembleModel(BasePageModel):
                     for cluster in page.predictions.layout.clusters:
                         # _log.info("Cluster label seen:", cluster.label)
                         if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
                             textlines = [
                                 cell.text.replace("\x02", "-").strip()
                                 for cell in cluster.cells
@@ -109,9 +108,7 @@ class PageAssembleModel(BasePageModel):
                                 tbl = page.predictions.tablestructure.table_map.get(
                                     cluster.id, None
                                 )
-                            if (
-                                not tbl
-                            ):  # fallback: add table without structure, if it isn't present
+                            if not tbl:  # fallback: add table without structure, if it isn't present
                                 tbl = Table(
                                     label=cluster.label,
                                     id=cluster.id,
@@ -130,9 +127,7 @@ class PageAssembleModel(BasePageModel):
                                 fig = page.predictions.figures_classification.figure_map.get(
                                     cluster.id, None
                                 )
-                            if (
-                                not fig
-                            ):  # fallback: add figure without classification, if it isn't present
+                            if not fig:  # fallback: add figure without classification, if it isn't present
                                 fig = FigureElement(
                                     label=cluster.label,
                                     id=cluster.id,

docling/models/page_preprocessing_model.py CHANGED Viewed

@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional
+from typing import Optional
 from PIL import ImageDraw
 from pydantic import BaseModel

docling/models/picture_description_api_model.py CHANGED Viewed

@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Type, Union
+from typing import Optional, Type, Union
 from PIL import Image

docling/models/picture_description_base_model.py CHANGED Viewed

@@ -1,12 +1,11 @@
-import logging
 from abc import abstractmethod
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Any, Iterable, List, Optional, Type, Union
+from typing import List, Optional, Type, Union
 from docling_core.types.doc import (
     DoclingDocument,
     NodeItem,
-    PictureClassificationClass,
     PictureItem,
 )
 from docling_core.types.doc.document import (  # TODO: move import to docling_core.types.doc

docling/models/picture_description_vlm_model.py CHANGED Viewed

@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Type, Union
+from typing import Optional, Type, Union
 from PIL import Image
@@ -13,7 +14,6 @@ from docling.utils.accelerator_utils import decide_device
 class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
     @classmethod
     def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
         return PictureDescriptionVlmOptions
@@ -36,7 +36,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
         self.options: PictureDescriptionVlmOptions
         if self.enabled:
             if artifacts_path is None:
                 artifacts_path = self.download_models(repo_id=self.options.repo_id)
             else:

docling/models/rapid_ocr_model.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Type
+from typing import Optional, Type
 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -74,13 +75,11 @@ class RapidOcrModel(BaseOcrModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
         if not self.enabled:
             yield from page_batch
             return
         for page in page_batch:
             assert page._backend is not None
             if not page._backend.is_valid():
                 yield page

docling/models/readingorder_model.py CHANGED Viewed

@@ -1,12 +1,7 @@
-import copy
-import random
 from pathlib import Path
 from typing import Dict, List
 from docling_core.types.doc import (
-    BoundingBox,
-    CoordOrigin,
-    DocItem,
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
@@ -17,13 +12,10 @@ from docling_core.types.doc import (
     TableData,
 )
 from docling_core.types.doc.document import ContentLayer
-from docling_core.types.legacy_doc.base import Ref
-from docling_core.types.legacy_doc.document import BaseText
 from docling_ibm_models.reading_order.reading_order_rb import (
     PageElement as ReadingOrderPageElement,
+    ReadingOrderPredictor,
 )
-from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
-from PIL import ImageDraw
 from pydantic import BaseModel, ConfigDict
 from docling.datamodel.base_models import (
@@ -35,7 +27,6 @@ from docling.datamodel.base_models import (
     TextElement,
 )
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.settings import settings
 from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -53,12 +44,10 @@ class ReadingOrderModel:
     def _assembled_to_readingorder_elements(
         self, conv_res: ConversionResult
     ) -> List[ReadingOrderPageElement]:
         elements: List[ReadingOrderPageElement] = []
         page_no_to_pages = {p.page_no: p for p in conv_res.pages}
         for element in conv_res.assembled.elements:
             page_height = page_no_to_pages[element.page_no].size.height  # type: ignore
             bbox = element.cluster.bbox.to_bottom_left_origin(page_height)
             text = element.text or ""
@@ -84,7 +73,6 @@ class ReadingOrderModel:
     def _add_child_elements(
         self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
     ):
         child: Cluster
         for child in element.cluster.children:
             c_label = child.label
@@ -110,7 +98,7 @@ class ReadingOrderModel:
             else:
                 doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
-    def _readingorder_elements_to_docling_doc(
+    def _readingorder_elements_to_docling_doc(  # noqa: C901
         self,
         conv_res: ConversionResult,
         ro_elements: List[ReadingOrderPageElement],
@@ -118,7 +106,6 @@ class ReadingOrderModel:
         el_to_footnotes_mapping: Dict[int, List[int]],
         el_merges_mapping: Dict[int, List[int]],
     ) -> DoclingDocument:
         id_to_elem = {
             RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
             for elem in conv_res.assembled.elements
@@ -192,7 +179,6 @@ class ReadingOrderModel:
                             code_item.footnotes.append(new_footnote_item.get_ref())
                 else:
                     new_item, current_list = self._handle_text_element(
                         element, out_doc, current_list, page_height
                     )
@@ -206,7 +192,6 @@ class ReadingOrderModel:
                             )
             elif isinstance(element, Table):
                 tbl_data = TableData(
                     num_rows=element.num_rows,
                     num_cols=element.num_cols,
@@ -342,12 +327,12 @@ class ReadingOrderModel:
         return new_item, current_list
     def _merge_elements(self, element, merged_elem, new_item, page_height):
-        assert isinstance(
-            merged_elem, type(element)
-        ), "Merged element must be of same type as element."
-        assert (
-            merged_elem.label == new_item.label
-        ), "Labels of merged elements must match."
+        assert isinstance(merged_elem, type(element)), (
+            "Merged element must be of same type as element."
+        )
+        assert merged_elem.label == new_item.label, (
+            "Labels of merged elements must match."
+        )
         prov = ProvenanceItem(
             page_no=element.page_no + 1,
             charspan=(

docling/models/table_structure_model.py CHANGED Viewed

@@ -1,13 +1,13 @@
 import copy
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Union
+from typing import Optional
 import numpy
 from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
 from docling_core.types.doc.page import (
     BoundingRectangle,
-    SegmentedPdfPage,
     TextCellUnit,
 )
 from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
@@ -44,7 +44,6 @@ class TableStructureModel(BasePageModel):
         self.enabled = enabled
         if self.enabled:
             if artifacts_path is None:
                 artifacts_path = self.download_models() / self._model_path
             else:
@@ -175,7 +174,6 @@ class TableStructureModel(BasePageModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
         if not self.enabled:
             yield from page_batch
             return
@@ -186,7 +184,6 @@ class TableStructureModel(BasePageModel):
                 yield page
             else:
                 with TimeRecorder(conv_res, "table_structure"):
                     assert page.predictions.layout is not None
                     assert page.size is not None
@@ -260,7 +257,6 @@ class TableStructureModel(BasePageModel):
                             table_out = tf_output[0]
                             table_cells = []
                             for element in table_out["tf_responses"]:
                                 if not self.do_cell_matching:
                                     the_bbox = BoundingBox.model_validate(
                                         element["bbox"]

docling/models/tesseract_ocr_cli_model.py CHANGED Viewed

@@ -3,9 +3,10 @@ import io
 import logging
 import os
 import tempfile
+from collections.abc import Iterable
 from pathlib import Path
 from subprocess import DEVNULL, PIPE, Popen
-from typing import Iterable, List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type
 import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -63,8 +64,7 @@ class TesseractOcrCliModel(BaseOcrModel):
                 )
     def _get_name_and_version(self) -> Tuple[str, str]:
-        if self._name != None and self._version != None:
+        if self._name is not None and self._version is not None:
             return self._name, self._version  # type: ignore
         cmd = [self.options.tesseract_cmd, "--version"]
@@ -125,14 +125,16 @@ class TesseractOcrCliModel(BaseOcrModel):
         # _log.info(decoded_data)
         # Read the TSV file generated by Tesseract
-        df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
+        df_result = pd.read_csv(
+            io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t"
+        )
         # Display the dataframe (optional)
         # _log.info("df: ", df.head())
         # Filter rows that contain actual text (ignore header or empty rows)
-        df_filtered = df[
-            df["text"].notnull() & (df["text"].apply(str).str.strip() != "")
+        df_filtered = df_result[
+            df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "")
         ]
         return df_filtered
@@ -149,10 +151,10 @@ class TesseractOcrCliModel(BaseOcrModel):
         proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
         output, _ = proc.communicate()
         decoded_data = output.decode("utf-8")
-        df = pd.read_csv(
+        df_detected = pd.read_csv(
             io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
         )
-        scripts = df.loc[df["key"] == "Script"].value.tolist()
+        scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
         if len(scripts) == 0:
             _log.warning("Tesseract cannot detect the script of the page")
             return None
@@ -183,11 +185,11 @@ class TesseractOcrCliModel(BaseOcrModel):
         proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
         output, _ = proc.communicate()
         decoded_data = output.decode("utf-8")
-        df = pd.read_csv(io.StringIO(decoded_data), header=None)
-        self._tesseract_languages = df[0].tolist()[1:]
+        df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
+        self._tesseract_languages = df_list[0].tolist()[1:]
         # Decide the script prefix
-        if any([l.startswith("script/") for l in self._tesseract_languages]):
+        if any(lang.startswith("script/") for lang in self._tesseract_languages):
             script_prefix = "script/"
         else:
             script_prefix = ""
@@ -197,7 +199,6 @@ class TesseractOcrCliModel(BaseOcrModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
         if not self.enabled:
             yield from page_batch
             return
@@ -225,19 +226,19 @@ class TesseractOcrCliModel(BaseOcrModel):
                                 fname = image_file.name
                                 high_res_image.save(image_file)
-                            df = self._run_tesseract(fname)
+                            df_result = self._run_tesseract(fname)
                         finally:
                             if os.path.exists(fname):
                                 os.remove(fname)
-                        # _log.info(df)
+                        # _log.info(df_result)
                         # Print relevant columns (bounding box and text)
-                        for ix, row in df.iterrows():
+                        for ix, row in df_result.iterrows():
                             text = row["text"]
                             conf = row["conf"]
-                            l = float(row["left"])
+                            l = float(row["left"])  # noqa: E741
                             b = float(row["top"])
                             w = float(row["width"])
                             h = float(row["height"])

docling/models/tesseract_ocr_model.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Type
+from typing import Optional, Type
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell
@@ -37,9 +38,6 @@ class TesseractOcrModel(BaseOcrModel):
         self.options: TesseractOcrOptions
         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
-        self.reader = None
-        self.osd_reader = None
-        self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
         if self.enabled:
             install_errmsg = (
@@ -64,7 +62,7 @@ class TesseractOcrModel(BaseOcrModel):
                 raise ImportError(install_errmsg)
             try:
                 tesseract_version = tesserocr.tesseract_version()
-            except:
+            except Exception:
                 raise ImportError(install_errmsg)
             _, self._tesserocr_languages = tesserocr.get_languages()
@@ -75,7 +73,7 @@ class TesseractOcrModel(BaseOcrModel):
             _log.debug("Initializing TesserOCR: %s", tesseract_version)
             lang = "+".join(self.options.lang)
-            if any([l.startswith("script/") for l in self._tesserocr_languages]):
+            if any(lang.startswith("script/") for lang in self._tesserocr_languages):
                 self.script_prefix = "script/"
             else:
                 self.script_prefix = ""
@@ -86,6 +84,10 @@ class TesseractOcrModel(BaseOcrModel):
                 "oem": tesserocr.OEM.DEFAULT,
             }
+            self.reader = None
+            self.osd_reader = None
+            self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
             if self.options.path is not None:
                 tesserocr_kwargs["path"] = self.options.path

docling/pipeline/base_pipeline.py CHANGED Viewed

@@ -3,9 +3,10 @@ import logging
 import time
 import traceback
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Iterable, List
+from collections.abc import Iterable
+from typing import Any, Callable, List
-from docling_core.types.doc import DoclingDocument, NodeItem
+from docling_core.types.doc import NodeItem
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
@@ -64,7 +65,6 @@ class BasePipeline(ABC):
         return conv_res
     def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
         def _prepare_elements(
             conv_res: ConversionResult, model: GenericEnrichmentModel[Any]
         ) -> Iterable[NodeItem]:
@@ -113,7 +113,6 @@ class BasePipeline(ABC):
 class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
     def __init__(self, pipeline_options: PipelineOptions):
         super().__init__(pipeline_options)
         self.keep_backend = False
@@ -127,7 +126,6 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
         yield from page_batch
     def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
         if not isinstance(conv_res.input._backend, PdfDocumentBackend):
             raise RuntimeError(
                 f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
@@ -139,8 +137,7 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
         total_elapsed_time = 0.0
         with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
-            for i in range(0, conv_res.input.page_count):
+            for i in range(conv_res.input.page_count):
                 start_page, end_page = conv_res.input.limits.page_range
                 if (start_page - 1) <= i <= (end_page - 1):
                     conv_res.pages.append(Page(page_no=i))
@@ -161,7 +158,6 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                     pipeline_pages = self._apply_on_pages(conv_res, init_pages)
                     for p in pipeline_pages:  # Must exhaust!
                         # Cleanup cached images
                         if not self.keep_images:
                             p._image_cache = {}

docling/pipeline/simple_pipeline.py CHANGED Viewed

@@ -24,7 +24,6 @@ class SimplePipeline(BasePipeline):
         super().__init__(pipeline_options)
     def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
         if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
             raise RuntimeError(
                 f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "

docling/pipeline/standard_pdf_pipeline.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import logging
-import sys
 import warnings
 from pathlib import Path
 from typing import Optional, cast

docling 2.30.0__py3-none-any.whl → 2.31.0__py3-none-any.whl

docling 2.30.0__py3-none-any.whl → 2.31.0__py3-none-any.whl