PyPI - docling - Versions diffs - 2.43.0__py3-none-any.whl → 2.44.0__py3-none-any.whl - Mend

docling 2.43.0__py3-none-any.whl → 2.44.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

docling/backend/html_backend.py CHANGED Viewed

@@ -125,8 +125,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         # set the title as furniture, since it is part of the document metadata
         title = self.soup.title
         if title:
+            title_text = title.get_text(separator=" ", strip=True)
+            title_clean = HTMLDocumentBackend._clean_unicode(title_text)
             doc.add_title(
-                text=title.get_text(separator=" ", strip=True),
+                text=title_clean,
+                orig=title_text,
                 content_layer=ContentLayer.FURNITURE,
             )
         # remove scripts/styles
@@ -168,10 +171,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 return
             for part in text.split("\n"):
                 seg = part.strip()
+                seg_clean = HTMLDocumentBackend._clean_unicode(seg)
                 if seg:
                     doc.add_text(
-                        DocItemLabel.TEXT,
-                        seg,
+                        label=DocItemLabel.TEXT,
+                        text=seg_clean,
+                        orig=seg,
                         parent=self.parents[self.level],
                         content_layer=self.content_layer,
                     )
@@ -203,13 +208,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         self.content_layer = ContentLayer.BODY
         level = int(tag_name[1])
         text = tag.get_text(strip=True, separator=" ")
+        text_clean = HTMLDocumentBackend._clean_unicode(text)
         # the first level is for the title item
         if level == 1:
             for key in self.parents.keys():
                 self.parents[key] = None
             self.level = 0
             self.parents[self.level + 1] = doc.add_title(
-                text, content_layer=self.content_layer
+                text=text_clean, orig=text, content_layer=self.content_layer
             )
         # the other levels need to be lowered by 1 if a title was set
         else:
@@ -234,7 +240,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 self.level = level
             self.parents[self.level + 1] = doc.add_heading(
                 parent=self.parents[self.level],
-                text=text,
+                text=text_clean,
+                orig=text,
                 level=self.level,
                 content_layer=self.content_layer,
             )
@@ -296,13 +303,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                         if text_part:
                             parts.append(text_part)
                 li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
+                li_clean = HTMLDocumentBackend._clean_unicode(li_text)
                 # 3) add the list item
                 if li_text:
                     self.parents[self.level + 1] = doc.add_list_item(
-                        text=li_text,
+                        text=li_clean,
                         enumerated=is_ordered,
                         marker=marker,
+                        orig=li_text,
                         parent=list_group,
                         content_layer=self.content_layer,
                     )
@@ -344,11 +353,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         elif tag_name in {"p", "address", "summary"}:
             for part in tag.text.split("\n"):
                 seg = part.strip()
+                seg_clean = HTMLDocumentBackend._clean_unicode(seg)
                 if seg:
                     doc.add_text(
-                        parent=self.parents[self.level],
                         label=DocItemLabel.TEXT,
-                        text=seg,
+                        text=seg_clean,
+                        orig=seg,
+                        parent=self.parents[self.level],
                         content_layer=self.content_layer,
                     )
             for img_tag in tag("img"):
@@ -370,10 +381,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         elif tag_name in {"pre", "code"}:
             # handle monospace code snippets (pre).
             text = tag.get_text(strip=True)
+            text_clean = HTMLDocumentBackend._clean_unicode(text)
             if text:
                 doc.add_code(
                     parent=self.parents[self.level],
-                    text=text,
+                    text=text_clean,
+                    orig=text,
                     content_layer=self.content_layer,
                 )
@@ -402,8 +415,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         caption_item: Optional[TextItem] = None
         if caption:
+            caption_clean = HTMLDocumentBackend._clean_unicode(caption)
             caption_item = doc.add_text(
-                DocItemLabel.CAPTION, text=caption, content_layer=self.content_layer
+                label=DocItemLabel.CAPTION,
+                text=caption_clean,
+                orig=caption,
+                content_layer=self.content_layer,
             )
         doc.add_picture(
@@ -442,6 +459,46 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         return "".join(parts)
+    @staticmethod
+    def _clean_unicode(text: str) -> str:
+        """Replace typical Unicode characters in HTML for text processing.
+        Several Unicode characters (e.g., non-printable or formatting) are typically
+        found in HTML but are worth replacing to sanitize text and ensure consistency
+        in text processing tasks.
+        Args:
+            text: The original text.
+        Returns:
+            The sanitized text without typical Unicode characters.
+        """
+        replacements = {
+            "\u00a0": " ",  # non-breaking space
+            "\u200b": "",  # zero-width space
+            "\u200c": "",  # zero-width non-joiner
+            "\u200d": "",  # zero-width joiner
+            "\u2010": "-",  # hyphen
+            "\u2011": "-",  # non-breaking hyphen
+            "\u2012": "-",  # dash
+            "\u2013": "-",  # dash
+            "\u2014": "-",  # dash
+            "\u2015": "-",  # horizontal bar
+            "\u2018": "'",  # left single quotation mark
+            "\u2019": "'",  # right single quotation mark
+            "\u201c": '"',  # left double quotation mark
+            "\u201d": '"',  # right double quotation mark
+            "\u2026": "...",  # ellipsis
+            "\u00ad": "",  # soft hyphen
+            "\ufeff": "",  # zero width non-break space
+            "\u202f": " ",  # narrow non-break space
+            "\u2060": "",  # word joiner
+        }
+        for raw, clean in replacements.items():
+            text = text.replace(raw, clean)
+        return text
     @staticmethod
     def _get_cell_spans(cell: Tag) -> tuple[int, int]:
         """Extract colspan and rowspan values from a table cell tag.
@@ -454,9 +511,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             str(cell.get("colspan", "1")),
             str(cell.get("rowspan", "1")),
         )
+        def _extract_num(s: str) -> int:
+            if s and s[0].isnumeric():
+                match = re.search(r"\d+", s)
+                if match:
+                    return int(match.group())
+            return 1
         int_spans: tuple[int, int] = (
-            int(raw_spans[0]) if raw_spans[0].isnumeric() else 1,
-            int(raw_spans[1]) if raw_spans[0].isnumeric() else 1,
+            _extract_num(raw_spans[0]),
+            _extract_num(raw_spans[1]),
         )
         return int_spans

docling/cli/main.py CHANGED Viewed

@@ -262,6 +262,12 @@ def export_documents(
         else:
             _log.warning(f"Document {conv_res.input.file} failed to convert.")
+            if _log.isEnabledFor(logging.INFO):
+                for err in conv_res.errors:
+                    _log.info(
+                        f"  [Failure Detail] Component: {err.component_type}, "
+                        f"Module: {err.module_name}, Message: {err.error_message}"
+                    )
             failure_count += 1
     _log.info(

docling/document_converter.py CHANGED Viewed

@@ -5,7 +5,9 @@ import threading
 import time
 from collections.abc import Iterable, Iterator
 from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime
 from functools import partial
+from io import BytesIO
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Type, Union
@@ -275,6 +277,34 @@ class DocumentConverter:
                 "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
             )
+    @validate_call(config=ConfigDict(strict=True))
+    def convert_string(
+        self,
+        content: str,
+        format: InputFormat,
+        name: Optional[str],
+    ) -> ConversionResult:
+        name = name or datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        if format == InputFormat.MD:
+            if not name.endswith(".md"):
+                name += ".md"
+            buff = BytesIO(content.encode("utf-8"))
+            doc_stream = DocumentStream(name=name, stream=buff)
+            return self.convert(doc_stream)
+        elif format == InputFormat.HTML:
+            if not name.endswith(".html"):
+                name += ".html"
+            buff = BytesIO(content.encode("utf-8"))
+            doc_stream = DocumentStream(name=name, stream=buff)
+            return self.convert(doc_stream)
+        else:
+            raise ValueError(f"format {format} is not supported in `convert_string`")
     def _convert(
         self, conv_input: _DocumentConversionInput, raises_on_error: bool
     ) -> Iterator[ConversionResult]:

docling/models/vlm_models_inline/mlx_model.py CHANGED Viewed

@@ -35,9 +35,9 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
         if self.enabled:
             try:
-                from mlx_vlm import generate, load  # type: ignore
+                from mlx_vlm import generate, load, stream_generate  # type: ignore
                 from mlx_vlm.prompt_utils import apply_chat_template  # type: ignore
-                from mlx_vlm.utils import load_config, stream_generate  # type: ignore
+                from mlx_vlm.utils import load_config  # type: ignore
             except ImportError:
                 raise ImportError(
                     "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."

{docling-2.43.0.dist-info → docling-2.44.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.43.0
+Version: 2.44.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
@@ -58,7 +58,7 @@ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrm
 Provides-Extra: vlm
 Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
 Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
-Requires-Dist: mlx-vlm<0.2,>=0.1.22; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
+Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
 Provides-Extra: rapidocr
 Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
 Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"

{docling-2.43.0.dist-info → docling-2.44.0.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
 docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/document_converter.py,sha256=pYlozCp6X1iGO75m3KSudMfrSCrXihTlRpKARFN67BI,14757
+docling/document_converter.py,sha256=l4b9m9NcbnwzXNNvf777nszyXznQJiaTXyIl_WehkyQ,15724
 docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
 docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -9,7 +9,7 @@ docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE
 docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
 docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
 docling/backend/docling_parse_v4_backend.py,sha256=qR_WRVq9JGtRioWCw6MnLWgbvXbC6Y1yds7Ol1-E6UQ,6550
-docling/backend/html_backend.py,sha256=Nuzyp6kyjd0g_MsBEPiWdFWU5w9UM60yWSluwU5C0M4,20310
+docling/backend/html_backend.py,sha256=0_l-I9gBAs0HKU3yKLQ3OqyYgB3V48hInv42GudnSjA,22856
 docling/backend/md_backend.py,sha256=qCI7SD9hnWWGrkG_drpzQv2Z7DVBG4Tsq3hhTsYV790,22562
 docling/backend/msexcel_backend.py,sha256=cq8MQ2RSh6pqCiVrldjOerSww7dOPTWmCQoCBI57i6w,18579
 docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
@@ -28,7 +28,7 @@ docling/backend/xml/jats_backend.py,sha256=LPj33EFdi2MRCakkLWrRLlUAc-B-949f8zp5g
 docling/backend/xml/uspto_backend.py,sha256=nyAMr5ht7dclxkVDwsKNeiOhLQrUtRLS8JdscB2AVJg,70924
 docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
 docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/cli/main.py,sha256=D2gEoArnQ2yQ9BesH9CkxZbYQyhZRGgjjNWYqmRRUtU,29617
+docling/cli/main.py,sha256=rXWR2QJFLeHLPWkMsLXvsVblX-KOXwbM8r0ku80KU5Q,29925
 docling/cli/models.py,sha256=9yLGp6QRJGpR86U3SjmWAXDt3MvBaJLLY4xDVdsu3O8,4160
 docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -71,7 +71,7 @@ docling/models/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
 docling/models/utils/hf_model_download.py,sha256=scBEfsM4yl7xPzqe7UtPvDh9RfQZQnuOhqQKilYBHls,984
 docling/models/vlm_models_inline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/models/vlm_models_inline/hf_transformers_model.py,sha256=LAnWFIHGblWln6DQMLtCQQW3-YUPDMbgeD2tjfM8vLM,8415
-docling/models/vlm_models_inline/mlx_model.py,sha256=p-H6wG31iVRoOjsqYaCVa4pEzxMP3vzLcsUatMjDJDQ,5948
+docling/models/vlm_models_inline/mlx_model.py,sha256=tqbJ8tmf2VBDuMLYIv9s1Ysn3G831k2uE_PdOv0kCaE,5948
 docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeXhJI,9070
 docling/pipeline/base_pipeline.py,sha256=iwUqmttXF9D2myXyCAaIqFuGjBFhPkjAybcSAGpww-Q,9525
@@ -92,9 +92,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
 docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
 docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
-docling-2.43.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
-docling-2.43.0.dist-info/METADATA,sha256=HS5J6rDKaZ_G_d4p10XgAwrNe-FjmHV-u5EmoTP4hro,10458
-docling-2.43.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-docling-2.43.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
-docling-2.43.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
-docling-2.43.0.dist-info/RECORD,,
+docling-2.44.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.44.0.dist-info/METADATA,sha256=SjD3EXlvgfyXIo8YoeldcAFX0r_nbJszp7VPoMLPFBk,10459
+docling-2.44.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+docling-2.44.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
+docling-2.44.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
+docling-2.44.0.dist-info/RECORD,,

{docling-2.43.0.dist-info → docling-2.44.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{docling-2.43.0.dist-info → docling-2.44.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{docling-2.43.0.dist-info → docling-2.44.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{docling-2.43.0.dist-info → docling-2.44.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

docling 2.43.0__py3-none-any.whl → 2.44.0__py3-none-any.whl

docling 2.43.0__py3-none-any.whl → 2.44.0__py3-none-any.whl