PyPI - docling - Versions diffs - 2.16.0__py3-none-any.whl → 2.17.0__py3-none-any.whl - Mend

docling 2.16.0-py3-none-any.whl → 2.17.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

docling/backend/html_backend.py +3 -2
docling/backend/md_backend.py +4 -8
docling/backend/xml/uspto_backend.py +25 -25
docling/cli/main.py +18 -3
docling/datamodel/document.py +2 -0
docling/datamodel/pipeline_options.py +1 -0
docling/models/rapid_ocr_model.py +1 -0
docling/models/tesseract_ocr_cli_model.py +72 -4
docling/models/tesseract_ocr_model.py +37 -37
docling/utils/ocr_utils.py +9 -0
{docling-2.16.0.dist-info → docling-2.17.0.dist-info}/METADATA +13 -10
{docling-2.16.0.dist-info → docling-2.17.0.dist-info}/RECORD +15 -14
{docling-2.16.0.dist-info → docling-2.17.0.dist-info}/LICENSE +0 -0
{docling-2.16.0.dist-info → docling-2.17.0.dist-info}/WHEEL +0 -0
{docling-2.16.0.dist-info → docling-2.17.0.dist-info}/entry_points.txt +0 -0

docling/backend/html_backend.py CHANGED Viewed

@@ -78,10 +78,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         if self.is_valid():
             assert self.soup is not None
+            content = self.soup.body or self.soup
             # Replace <br> tags with newline characters
-            for br in self.soup.body.find_all("br"):
+            for br in content.find_all("br"):
                 br.replace_with("\n")
-            doc = self.walk(self.soup.body, doc)
+            doc = self.walk(content, doc)
         else:
             raise RuntimeError(
                 f"Cannot convert doc with {self.document_hash} because the backend failed to init."

docling/backend/md_backend.py CHANGED Viewed

@@ -65,7 +65,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         self.in_table = False
         self.md_table_buffer: list[str] = []
-        self.inline_text_buffer = ""
+        self.inline_texts: list[str] = []
         try:
             if isinstance(self.path_or_stream, BytesIO):
@@ -152,15 +152,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
     def process_inline_text(
         self, parent_element: Optional[NodeItem], doc: DoclingDocument
     ):
-        # self.inline_text_buffer += str(text_in)
-        txt = self.inline_text_buffer.strip()
+        txt = " ".join(self.inline_texts)
         if len(txt) > 0:
             doc.add_text(
                 label=DocItemLabel.PARAGRAPH,
                 parent=parent_element,
                 text=txt,
             )
-        self.inline_text_buffer = ""
+        self.inline_texts = []
     def iterate_elements(
         self,
@@ -266,9 +265,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 self.close_table(doc)
                 self.in_table = False
                 # most likely just inline text
-                self.inline_text_buffer += str(
-                    element.children
-                )  # do not strip an inline text, as it may contain important spaces
+                self.inline_texts.append(str(element.children))
         elif isinstance(element, marko.inline.CodeSpan):
             self.close_table(doc)
@@ -292,7 +289,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             doc.add_code(parent=parent_element, text=snippet_text)
         elif isinstance(element, marko.inline.LineBreak):
-            self.process_inline_text(parent_element, doc)
             if self.in_table:
                 _log.debug("Line break in a table")
                 self.md_table_buffer.append("")

docling/backend/xml/uspto_backend.py CHANGED Viewed

@@ -389,7 +389,7 @@ class PatentUsptoIce(PatentUspto):
             if name == self.Element.TITLE.value:
                 if text:
                     self.parents[self.level + 1] = self.doc.add_title(
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                         text=text,
                     )
                     self.level += 1
@@ -406,7 +406,7 @@ class PatentUsptoIce(PatentUspto):
                     abstract_item = self.doc.add_heading(
                         heading_text,
                         level=heading_level,
-                        parent=self.parents[heading_level],  # type: ignore[arg-type]
+                        parent=self.parents[heading_level],
                     )
                     self.doc.add_text(
                         label=DocItemLabel.PARAGRAPH,
@@ -434,7 +434,7 @@ class PatentUsptoIce(PatentUspto):
                 claims_item = self.doc.add_heading(
                     heading_text,
                     level=heading_level,
-                    parent=self.parents[heading_level],  # type: ignore[arg-type]
+                    parent=self.parents[heading_level],
                 )
                 for text in self.claims:
                     self.doc.add_text(
@@ -452,7 +452,7 @@ class PatentUsptoIce(PatentUspto):
                     self.doc.add_text(
                         label=DocItemLabel.PARAGRAPH,
                         text=text,
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                     )
                 self.text = ""
@@ -460,7 +460,7 @@ class PatentUsptoIce(PatentUspto):
                 self.parents[self.level + 1] = self.doc.add_heading(
                     text=text,
                     level=self.level,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
                 self.level += 1
                 self.text = ""
@@ -470,7 +470,7 @@ class PatentUsptoIce(PatentUspto):
                 empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
                 self.doc.add_table(
                     data=empty_table,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
         def _apply_style(self, text: str, style_tag: str) -> str:
@@ -721,7 +721,7 @@ class PatentUsptoGrantV2(PatentUspto):
                 if self.Element.TITLE.value in self.property and text.strip():
                     title = text.strip()
                     self.parents[self.level + 1] = self.doc.add_title(
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                         text=title,
                     )
                     self.level += 1
@@ -749,7 +749,7 @@ class PatentUsptoGrantV2(PatentUspto):
                     self.parents[self.level + 1] = self.doc.add_heading(
                         text=text.strip(),
                         level=self.level,
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                     )
                     self.level += 1
@@ -769,7 +769,7 @@ class PatentUsptoGrantV2(PatentUspto):
                 claims_item = self.doc.add_heading(
                     heading_text,
                     level=heading_level,
-                    parent=self.parents[heading_level],  # type: ignore[arg-type]
+                    parent=self.parents[heading_level],
                 )
                 for text in self.claims:
                     self.doc.add_text(
@@ -787,7 +787,7 @@ class PatentUsptoGrantV2(PatentUspto):
                 abstract_item = self.doc.add_heading(
                     heading_text,
                     level=heading_level,
-                    parent=self.parents[heading_level],  # type: ignore[arg-type]
+                    parent=self.parents[heading_level],
                 )
                 self.doc.add_text(
                     label=DocItemLabel.PARAGRAPH, text=abstract, parent=abstract_item
@@ -799,7 +799,7 @@ class PatentUsptoGrantV2(PatentUspto):
                     self.doc.add_text(
                         label=DocItemLabel.PARAGRAPH,
                         text=paragraph,
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                     )
                 elif self.Element.CLAIM.value in self.property:
                     # we may need a space after a paragraph in claim text
@@ -811,7 +811,7 @@ class PatentUsptoGrantV2(PatentUspto):
                 empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
                 self.doc.add_table(
                     data=empty_table,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
         def _apply_style(self, text: str, style_tag: str) -> str:
@@ -938,7 +938,7 @@ class PatentUsptoGrantAps(PatentUspto):
         self.parents[self.level + 1] = self.doc.add_heading(
             heading.value,
             level=self.level,
-            parent=self.parents[self.level],  # type: ignore[arg-type]
+            parent=self.parents[self.level],
         )
         self.level += 1
@@ -959,7 +959,7 @@ class PatentUsptoGrantAps(PatentUspto):
         if field == self.Field.TITLE.value:
             self.parents[self.level + 1] = self.doc.add_title(
-                parent=self.parents[self.level], text=value  # type: ignore[arg-type]
+                parent=self.parents[self.level], text=value
             )
             self.level += 1
@@ -971,14 +971,14 @@ class PatentUsptoGrantAps(PatentUspto):
                 self.doc.add_text(
                     label=DocItemLabel.PARAGRAPH,
                     text=value,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
         elif field == self.Field.NUMBER.value and section == self.Section.CLAIMS.value:
             self.doc.add_text(
                 label=DocItemLabel.PARAGRAPH,
                 text="",
-                parent=self.parents[self.level],  # type: ignore[arg-type]
+                parent=self.parents[self.level],
             )
         elif (
@@ -996,7 +996,7 @@ class PatentUsptoGrantAps(PatentUspto):
                 last_claim = self.doc.add_text(
                     label=DocItemLabel.PARAGRAPH,
                     text="",
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
             last_claim.text += f" {value}" if last_claim.text else value
@@ -1012,7 +1012,7 @@ class PatentUsptoGrantAps(PatentUspto):
             self.parents[self.level + 1] = self.doc.add_heading(
                 value,
                 level=self.level,
-                parent=self.parents[self.level],  # type: ignore[arg-type]
+                parent=self.parents[self.level],
             )
             self.level += 1
@@ -1029,7 +1029,7 @@ class PatentUsptoGrantAps(PatentUspto):
             self.doc.add_text(
                 label=DocItemLabel.PARAGRAPH,
                 text=value,
-                parent=self.parents[self.level],  # type: ignore[arg-type]
+                parent=self.parents[self.level],
             )
     def parse(self, patent_content: str) -> Optional[DoclingDocument]:
@@ -1283,7 +1283,7 @@ class PatentUsptoAppV1(PatentUspto):
                 title = text.strip()
                 if title:
                     self.parents[self.level + 1] = self.doc.add_text(
-                        parent=self.parents[self.level],  # type: ignore[arg-type]
+                        parent=self.parents[self.level],
                         label=DocItemLabel.TITLE,
                         text=title,
                     )
@@ -1301,7 +1301,7 @@ class PatentUsptoAppV1(PatentUspto):
                     abstract_item = self.doc.add_heading(
                         heading_text,
                         level=heading_level,
-                        parent=self.parents[heading_level],  # type: ignore[arg-type]
+                        parent=self.parents[heading_level],
                     )
                     self.doc.add_text(
                         label=DocItemLabel.PARAGRAPH,
@@ -1331,7 +1331,7 @@ class PatentUsptoAppV1(PatentUspto):
                 claims_item = self.doc.add_heading(
                     heading_text,
                     level=heading_level,
-                    parent=self.parents[heading_level],  # type: ignore[arg-type]
+                    parent=self.parents[heading_level],
                 )
                 for text in self.claims:
                     self.doc.add_text(
@@ -1350,14 +1350,14 @@ class PatentUsptoAppV1(PatentUspto):
                         self.parents[self.level + 1] = self.doc.add_heading(
                             text=text,
                             level=self.level,
-                            parent=self.parents[self.level],  # type: ignore[arg-type]
+                            parent=self.parents[self.level],
                         )
                         self.level += 1
                     else:
                         self.doc.add_text(
                             label=DocItemLabel.PARAGRAPH,
                             text=text,
-                            parent=self.parents[self.level],  # type: ignore[arg-type]
+                            parent=self.parents[self.level],
                         )
                 self.text = ""
@@ -1366,7 +1366,7 @@ class PatentUsptoAppV1(PatentUspto):
                 empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
                 self.doc.add_table(
                     data=empty_table,
-                    parent=self.parents[self.level],  # type: ignore[arg-type]
+                    parent=self.parents[self.level],
                 )
         def _apply_style(self, text: str, style_tag: str) -> str:

docling/cli/main.py CHANGED Viewed

@@ -1,18 +1,18 @@
 import importlib
-import json
 import logging
+import platform
 import re
+import sys
 import tempfile
 import time
 import warnings
-from enum import Enum
 from pathlib import Path
 from typing import Annotated, Dict, Iterable, List, Optional, Type
 import typer
 from docling_core.types.doc import ImageRefMode
 from docling_core.utils.file import resolve_source_to_path
-from pydantic import TypeAdapter, ValidationError
+from pydantic import TypeAdapter
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -65,10 +65,15 @@ def version_callback(value: bool):
         docling_core_version = importlib.metadata.version("docling-core")
         docling_ibm_models_version = importlib.metadata.version("docling-ibm-models")
         docling_parse_version = importlib.metadata.version("docling-parse")
+        platform_str = platform.platform()
+        py_impl_version = sys.implementation.cache_tag
+        py_lang_version = platform.python_version()
         print(f"Docling version: {docling_version}")
         print(f"Docling Core version: {docling_core_version}")
         print(f"Docling IBM Models version: {docling_ibm_models_version}")
         print(f"Docling Parse version: {docling_parse_version}")
+        print(f"Python: {py_impl_version} ({py_lang_version})")
+        print(f"Platform: {platform_str}")
         raise typer.Exit()
@@ -206,6 +211,14 @@ def convert(
         TableFormerMode,
         typer.Option(..., help="The mode to use in the table structure model."),
     ] = TableFormerMode.FAST,
+    enrich_code: Annotated[
+        bool,
+        typer.Option(..., help="Enable the code enrichment model in the pipeline."),
+    ] = False,
+    enrich_formula: Annotated[
+        bool,
+        typer.Option(..., help="Enable the formula enrichment model in the pipeline."),
+    ] = False,
     artifacts_path: Annotated[
         Optional[Path],
         typer.Option(..., help="If provided, the location of the model artifacts."),
@@ -360,6 +373,8 @@ def convert(
             do_ocr=ocr,
             ocr_options=ocr_options,
             do_table_structure=True,
+            do_code_enrichment=enrich_code,
+            do_formula_enrichment=enrich_formula,
             document_timeout=document_timeout,
         )
         pipeline_options.table_structure_options.do_cell_matching = (

docling/datamodel/document.py CHANGED Viewed

@@ -352,6 +352,8 @@ class _DocumentConversionInput(BaseModel):
             mime = FormatToMimeType[InputFormat.MD][0]
         elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
             mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
+        elif ext in FormatToExtensions[InputFormat.PDF]:
+            mime = FormatToMimeType[InputFormat.PDF][0]
         return mime
     @staticmethod

docling/datamodel/pipeline_options.py CHANGED Viewed

@@ -119,6 +119,7 @@ class RapidOcrOptions(OcrOptions):
     det_model_path: Optional[str] = None  # same default as rapidocr
     cls_model_path: Optional[str] = None  # same default as rapidocr
     rec_model_path: Optional[str] = None  # same default as rapidocr
+    rec_keys_path: Optional[str] = None  # same default as rapidocr
     model_config = ConfigDict(
         extra="forbid",

docling/models/rapid_ocr_model.py CHANGED Viewed

@@ -59,6 +59,7 @@ class RapidOcrModel(BaseOcrModel):
                 det_model_path=self.options.det_model_path,
                 cls_model_path=self.options.cls_model_path,
                 rec_model_path=self.options.rec_model_path,
+                rec_keys_path=self.options.rec_keys_path,
             )
     def __call__(

docling/models/tesseract_ocr_cli_model.py CHANGED Viewed

@@ -4,7 +4,7 @@ import logging
 import os
 import tempfile
 from subprocess import DEVNULL, PIPE, Popen
-from typing import Iterable, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple
 import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -14,6 +14,7 @@ from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractCliOcrOptions
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.ocr_utils import map_tesseract_script
 from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
@@ -28,10 +29,13 @@ class TesseractOcrCliModel(BaseOcrModel):
         self._name: Optional[str] = None
         self._version: Optional[str] = None
+        self._tesseract_languages: Optional[List[str]] = None
+        self._script_prefix: Optional[str] = None
         if self.enabled:
             try:
                 self._get_name_and_version()
+                self._set_languages_and_prefix()
             except Exception as exc:
                 raise RuntimeError(
@@ -73,12 +77,20 @@ class TesseractOcrCliModel(BaseOcrModel):
         return name, version
     def _run_tesseract(self, ifilename: str):
+        r"""
+        Run tesseract CLI
+        """
         cmd = [self.options.tesseract_cmd]
-        if self.options.lang is not None and len(self.options.lang) > 0:
+        if "auto" in self.options.lang:
+            lang = self._detect_language(ifilename)
+            if lang is not None:
+                cmd.append("-l")
+                cmd.append(lang)
+        elif self.options.lang is not None and len(self.options.lang) > 0:
             cmd.append("-l")
             cmd.append("+".join(self.options.lang))
         if self.options.path is not None:
             cmd.append("--tessdata-dir")
             cmd.append(self.options.path)
@@ -106,6 +118,63 @@ class TesseractOcrCliModel(BaseOcrModel):
         return df_filtered
+    def _detect_language(self, ifilename: str):
+        r"""
+        Run tesseract in PSM 0 mode to detect the language
+        """
+        assert self._tesseract_languages is not None
+        cmd = [self.options.tesseract_cmd]
+        cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
+        _log.info("command: {}".format(" ".join(cmd)))
+        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
+        output, _ = proc.communicate()
+        decoded_data = output.decode("utf-8")
+        df = pd.read_csv(
+            io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
+        )
+        scripts = df.loc[df["key"] == "Script"].value.tolist()
+        if len(scripts) == 0:
+            _log.warning("Tesseract cannot detect the script of the page")
+            return None
+        script = map_tesseract_script(scripts[0].strip())
+        lang = f"{self._script_prefix}{script}"
+        # Check if the detected language has been installed
+        if lang not in self._tesseract_languages:
+            msg = f"Tesseract detected the script '{script}' and language '{lang}'."
+            msg += " However this language is not installed in your system and will be ignored."
+            _log.warning(msg)
+            return None
+        _log.debug(
+            f"Using tesseract model for the detected script '{script}' and language '{lang}'"
+        )
+        return lang
+    def _set_languages_and_prefix(self):
+        r"""
+        Read and set the languages installed in tesseract and decide the script prefix
+        """
+        # Get all languages
+        cmd = [self.options.tesseract_cmd]
+        cmd.append("--list-langs")
+        _log.info("command: {}".format(" ".join(cmd)))
+        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
+        output, _ = proc.communicate()
+        decoded_data = output.decode("utf-8")
+        df = pd.read_csv(io.StringIO(decoded_data), header=None)
+        self._tesseract_languages = df[0].tolist()[1:]
+        # Decide the script prefix
+        if any([l.startswith("script/") for l in self._tesseract_languages]):
+            script_prefix = "script/"
+        else:
+            script_prefix = ""
+        self._script_prefix = script_prefix
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
@@ -120,7 +189,6 @@ class TesseractOcrCliModel(BaseOcrModel):
                 yield page
             else:
                 with TimeRecorder(conv_res, "ocr"):
                     ocr_rects = self.get_ocr_rects(page)
                     all_ocr_cells = []

docling/models/tesseract_ocr_model.py CHANGED Viewed

@@ -8,6 +8,7 @@ from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractOcrOptions
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.ocr_utils import map_tesseract_script
 from docling.utils.profiling import TimeRecorder
 _log = logging.getLogger(__name__)
@@ -20,6 +21,7 @@ class TesseractOcrModel(BaseOcrModel):
         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
         self.reader = None
+        self.osd_reader = None
         if self.enabled:
             install_errmsg = (
@@ -47,8 +49,8 @@ class TesseractOcrModel(BaseOcrModel):
             except:
                 raise ImportError(install_errmsg)
-            _, tesserocr_languages = tesserocr.get_languages()
-            if not tesserocr_languages:
+            _, self._tesserocr_languages = tesserocr.get_languages()
+            if not self._tesserocr_languages:
                 raise ImportError(missing_langs_errmsg)
             # Initialize the tesseractAPI
@@ -57,7 +59,7 @@ class TesseractOcrModel(BaseOcrModel):
             self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
-            if any([l.startswith("script/") for l in tesserocr_languages]):
+            if any([l.startswith("script/") for l in self._tesserocr_languages]):
                 self.script_prefix = "script/"
             else:
                 self.script_prefix = ""
@@ -72,14 +74,14 @@ class TesseractOcrModel(BaseOcrModel):
                 tesserocr_kwargs["path"] = self.options.path
             if lang == "auto":
-                self.reader = tesserocr.PyTessBaseAPI(
+                self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
+                self.osd_reader = tesserocr.PyTessBaseAPI(
                     **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
                 )
             else:
                 self.reader = tesserocr.PyTessBaseAPI(
                     **{"lang": lang} | tesserocr_kwargs,
                 )
             self.reader_RIL = tesserocr.RIL
     def __del__(self):
@@ -96,8 +98,6 @@ class TesseractOcrModel(BaseOcrModel):
             yield from page_batch
             return
-        import tesserocr
         for page in page_batch:
             assert page._backend is not None
             if not page._backend.is_valid():
@@ -105,6 +105,7 @@ class TesseractOcrModel(BaseOcrModel):
             else:
                 with TimeRecorder(conv_res, "ocr"):
                     assert self.reader is not None
+                    assert self._tesserocr_languages is not None
                     ocr_rects = self.get_ocr_rects(page)
@@ -117,43 +118,42 @@ class TesseractOcrModel(BaseOcrModel):
                             scale=self.scale, cropbox=ocr_rect
                         )
-                        # Retrieve text snippets with their bounding boxes
-                        self.reader.SetImage(high_res_image)
+                        local_reader = self.reader
+                        if "auto" in self.options.lang:
+                            assert self.osd_reader is not None
-                        if self.options.lang == ["auto"]:
-                            osd = self.reader.DetectOrientationScript()
+                            self.osd_reader.SetImage(high_res_image)
+                            osd = self.osd_reader.DetectOrientationScript()
                             # No text, probably
                             if osd is None:
                                 continue
                             script = osd["script_name"]
-                            if script == "Katakana" or script == "Hiragana":
-                                script = "Japanese"
-                            elif script == "Han":
-                                script = "HanS"
-                            elif script == "Korean":
-                                script = "Hangul"
-                            _log.debug(
-                                f'Using model for the detected script "{script}"'
-                            )
-                            if script not in self.script_readers:
-                                self.script_readers[script] = tesserocr.PyTessBaseAPI(
-                                    path=self.reader.GetDatapath(),
-                                    lang=f"{self.script_prefix}{script}",
-                                    psm=tesserocr.PSM.AUTO,
-                                    init=True,
-                                    oem=tesserocr.OEM.DEFAULT,
-                                )
-                            local_reader = self.script_readers[script]
-                            local_reader.SetImage(high_res_image)
-                        else:
-                            local_reader = self.reader
+                            script = map_tesseract_script(script)
+                            lang = f"{self.script_prefix}{script}"
+                            # Check if the detected languge is present in the system
+                            if lang not in self._tesserocr_languages:
+                                msg = f"Tesseract detected the script '{script}' and language '{lang}'."
+                                msg += " However this language is not installed in your system and will be ignored."
+                                _log.warning(msg)
+                            else:
+                                if script not in self.script_readers:
+                                    import tesserocr
+                                    self.script_readers[script] = (
+                                        tesserocr.PyTessBaseAPI(
+                                            path=self.reader.GetDatapath(),
+                                            lang=lang,
+                                            psm=tesserocr.PSM.AUTO,
+                                            init=True,
+                                            oem=tesserocr.OEM.DEFAULT,
+                                        )
+                                    )
+                                local_reader = self.script_readers[script]
+                        local_reader.SetImage(high_res_image)
                         boxes = local_reader.GetComponentImages(
                             self.reader_RIL.TEXTLINE, True
                         )

docling/utils/ocr_utils.py ADDED Viewed

@@ -0,0 +1,9 @@
+def map_tesseract_script(script: str) -> str:
+    r""" """
+    if script == "Katakana" or script == "Hiragana":
+        script = "Japanese"
+    elif script == "Han":
+        script = "HanS"
+    elif script == "Korean":
+        script = "Hangul"
+    return script

{docling-2.16.0.dist-info → docling-2.17.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.16.0
+Version: 2.17.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -78,22 +78,21 @@ Description-Content-Type: text/markdown
 [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
 [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
-Docling parses documents and exports them to the desired format with ease and speed.
+Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
 ## Features
-* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
-* 📑 Advanced PDF document understanding including page layout, reading order & table structures
-* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
-* 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
-* 🔍 OCR support for scanned PDFs
+* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more
+* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
+* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
+* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON
+* 🔒 Local execution capabilities for sensitive data and air-gapped environments
+* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
+* 🔍 Extensive OCR support for scanned PDFs and images
 * 💻 Simple and convenient CLI
-Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty examples and unlock the full power of Docling!
 ### Coming soon
-* ♾️ Equation & code extraction
 * 📝 Metadata extraction, including title, authors, references & language
 ## Installation
@@ -177,3 +176,7 @@ For individual model usage, please refer to the model licenses found in the orig
 Docling has been brought to you by IBM.
+[supported_formats]: https://ds4sd.github.io/docling/supported_formats/
+[docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/
+[integrations]: https://ds4sd.github.io/docling/integrations/

{docling-2.16.0.dist-info → docling-2.17.0.dist-info}/RECORD RENAMED Viewed

@@ -4,10 +4,10 @@ docling/backend/abstract_backend.py,sha256=1lNxzwDTn303aXduPDVmTyXn-5ZIoWMLYqNxA
 docling/backend/asciidoc_backend.py,sha256=zyHxlG_BvlLwvpdNca3P6aopxOJZw8wbDFkJQQknNXk,14050
 docling/backend/docling_parse_backend.py,sha256=hEEJibI1oJS0LAnFoIs6gMshS3bCqGtVxHnDNvBGZuA,7649
 docling/backend/docling_parse_v2_backend.py,sha256=IpwrBrtLGwNRl5AYO-o3NjEfNRsAkuMhzvDt2HXb9Ko,8655
-docling/backend/html_backend.py,sha256=vUEfx0h24gEaHO2taQyWNs8zCkDox7kopEeMbWBXss0,15560
+docling/backend/html_backend.py,sha256=DDfQ84VQB4nF_0wgGtbYUA9luVumB5bjjoWHjESa6Tk,15596
 docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
-docling/backend/md_backend.py,sha256=ajEooDWNnWPHnPQMgUDh-K44Ch1X-sTBHqa1xBp7yJs,14645
+docling/backend/md_backend.py,sha256=PicGKM2cg4r1lztr46eC4sKbFLvGnqzrEcLTE5fW1zc,14426
 docling/backend/msexcel_backend.py,sha256=lyJc4ShJGAN2ZfNTTuhdYTF-44cZsGyn_8Djstp3IEU,12700
 docling/backend/mspowerpoint_backend.py,sha256=kOGawhcn0BFq4M_C6kW0mY8vMIB24_6R6q6GaszbSt0,15957
 docling/backend/msword_backend.py,sha256=WcQmRYmpH8o2snGoWGxNRkCtUI3mf2JL3-9CxAfDAJg,19232
@@ -15,14 +15,14 @@ docling/backend/pdf_backend.py,sha256=17Pr8dWsD1C4FYUprrwMM9trDGW-JYLjrcScx1Ul4i
 docling/backend/pypdfium2_backend.py,sha256=QSPfp903ZtSpoNqPmcIek0HmvETrJ1kkwrdxnF5pjS0,9014
 docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/xml/pubmed_backend.py,sha256=LMnpowjnxa5SydfNC00Ll840BYraL8dCJu-FfC9iSKk,20447
-docling/backend/xml/uspto_backend.py,sha256=2YsnB-WRARIAaHPL6gxHePP24GQGi-Up2_K8ZapD3k4,70974
+docling/backend/xml/uspto_backend.py,sha256=a5GxWLj2SUR5Of8TWJinhef1gKyaQSjHPVXvGiN8yG8,70324
 docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
 docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/cli/main.py,sha256=NR7NEt8Sf3FE9D7sHpEmABM9mFMTMO5w0VPwYIIvVsk,15481
+docling/cli/main.py,sha256=K5C2yQIoM40_W3YU8a7SmneY-hWbNp_JOFPLk0NPcDI,16098
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/datamodel/base_models.py,sha256=vewP1X99qfAwiUsiC2m8CBDGiQPsGyp_WkKJHYPoYn4,7026
-docling/datamodel/document.py,sha256=R748mLCFai4MeiE8ougQrQVJF_16t3f4CUrrEes5AV0,13202
-docling/datamodel/pipeline_options.py,sha256=GA5LwywfOkcBDvG2LhDHikqDQYlFlUPJa93tPSx-vFw,7820
+docling/datamodel/document.py,sha256=vuY8S9n-_w5UQl-7C_wasrW4bSHPQeAeH4RR-MWrGW4,13315
+docling/datamodel/pipeline_options.py,sha256=f9-VQFgOdahyclGQgH_T8ZYBopkWsF_fbWbxo39ux3g,7888
 docling/datamodel/settings.py,sha256=Sw0rN_f8rdLV1eNvVeKiyET2Oe6oz9jtW3lJzniW9Do,1302
 docling/document_converter.py,sha256=qtYPEkWuMUUGmFko2in38iSHdYrjAFf_GHNoXRRvEVs,12631
 docling/exceptions.py,sha256=-FoP46rFJgz_jn5uDv2V052udEEg8gckk6uhoItchXc,85
@@ -37,10 +37,10 @@ docling/models/layout_model.py,sha256=3Fw7OM6g0j7NgItKsQOgFOCd1q6lp1DacN_db7f6QC
 docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
 docling/models/page_assemble_model.py,sha256=c5KLKwkUIdW0JcDHizWsqrpb5x_3DK28x82Q8o-3VJM,5968
 docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
-docling/models/rapid_ocr_model.py,sha256=LOIvczJs3_db2o8mtrKk-pIXgC-xqWqRLu2cjA3wvy4,4980
+docling/models/rapid_ocr_model.py,sha256=2HXmurNRPP6qyqn7U5h9NQIs8zi0TMHf56CpcKQk0fU,5038
 docling/models/table_structure_model.py,sha256=fUpCHthO4Uk3BhA99a85BHBm51fmdE9kfqhAk3WjuBw,9392
-docling/models/tesseract_ocr_cli_model.py,sha256=FP9cnSkSyj6-EETHtabV720Fr3x9K_oBP2UuJi4VUwE,6621
-docling/models/tesseract_ocr_model.py,sha256=N27xjo8aPb5x276wKHkf_6VFwJObfosdHLo5_hCuf94,8055
+docling/models/tesseract_ocr_cli_model.py,sha256=b2Is5x2gZLS6mQWnKe0y7p6UU6hRTHDfoH4D2RQ5mx0,9310
+docling/models/tesseract_ocr_model.py,sha256=BN85u-4a-xzUY7Iw21Ib8L8kx4mgbDGiUtxBelLiJm8,8513
 docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/pipeline/base_pipeline.py,sha256=J0ZjtincsJr-BbRgqoQozxIhDWxWFlWaS9CTPwypJFk,8621
 docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
@@ -51,11 +51,12 @@ docling/utils/accelerator_utils.py,sha256=ZjULCn-qhxqx3frF-rJmAlWdzqgUMxH5utLHbS
 docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
 docling/utils/glm_utils.py,sha256=Nfxdx0W-sl1owYncTeJmZdiPcn-jpTqK8f8TeQlDOMY,11683
 docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
+docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,263
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
 docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
 docling/utils/visualization.py,sha256=4pn-80fVuE04ken7hUg5Ar47ndRSL9MWBgdHM-1g1zU,2735
-docling-2.16.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
-docling-2.16.0.dist-info/METADATA,sha256=wJgRO2R9Szl69jFE8gj-VGIBpkwwMWPfgytz9nDsT_E,7780
-docling-2.16.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling-2.16.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
-docling-2.16.0.dist-info/RECORD,,
+docling-2.17.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.17.0.dist-info/METADATA,sha256=BkpXBck-2EjuYUsn_2aAGftgdbf260baqAb9P8ZixSM,8025
+docling-2.17.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling-2.17.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
+docling-2.17.0.dist-info/RECORD,,

{docling-2.16.0.dist-info → docling-2.17.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{docling-2.16.0.dist-info → docling-2.17.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{docling-2.16.0.dist-info → docling-2.17.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

docling 2.16.0__py3-none-any.whl → 2.17.0__py3-none-any.whl

docling 2.16.0__py3-none-any.whl → 2.17.0__py3-none-any.whl