PyPI - docling-haystack - Version diff - 0.2.0.tar.gz → 0.4.0.tar.gz - Mend

docling-haystack 0.2.0 (tar.gz) → 0.4.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

docling_haystack-0.4.0/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,20 @@
+# Changelog
+## [integrations/docling-v0.3.0] - 2026-04-10
+### 🚀 Features
+- (docling) Drop temp files for ByteStream sources (#3130)
+## [integrations/docling-v0.2.0] - 2026-04-08
+### 🚀 Features
+- Add Docling document converter (#3066)
+### 🚜 Refactor
+- *(docling)* Add meta parameter to run(); introduce sources; deprecate paths (#3103)
+<!-- generated by git-cliff -->

{docling_haystack-0.2.0 → docling_haystack-0.4.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling-haystack
-Version: 0.2.0
+Version: 0.4.0
 Summary: Haystack integration for docling
 Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/docling#readme
 Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues

{docling_haystack-0.2.0 → docling_haystack-0.4.0}/src/haystack_integrations/components/converters/docling/converter.py RENAMED Viewed

@@ -1,22 +1,50 @@
 """Docling Haystack converter module."""
 import json
-import os
-import tempfile
+import mimetypes
 import warnings
 from abc import ABC, abstractmethod
 from enum import Enum
+from io import BytesIO
 from pathlib import Path
 from typing import Any
-from haystack import Document, component
+from docling_core.types.io import DocumentStream
+from haystack import Document, component, logging
 from haystack.components.converters.utils import normalize_metadata
+from haystack.core.serialization import default_from_dict, default_to_dict
 from haystack.dataclasses import ByteStream
+from haystack.utils.base_serialization import deserialize_class_instance, serialize_class_instance
 from docling.chunking import BaseChunk, BaseChunker, HybridChunker
 from docling.datamodel.document import DoclingDocument
 from docling.document_converter import DocumentConverter
+logger = logging.getLogger(__name__)
+def _bytestream_to_document_stream(source: ByteStream) -> DocumentStream:
+    """
+    Build a `DocumentStream` from a Haystack `ByteStream`.
+    Resolves the stream name by checking common metadata keys (`file_path`, `file_name`, `name`) and falling back to
+    MIME-type extension guessing so that docling can reliably detect the input format.
+    """
+    meta = source.meta or {}
+    raw_name = meta.get("file_path") or meta.get("file_name") or meta.get("name")
+    if raw_name:
+        name = Path(raw_name).name
+    else:
+        name = "document"
+    if not Path(name).suffix and source.mime_type:
+        ext = mimetypes.guess_extension(source.mime_type)
+        if ext:
+            name = f"{name}{ext}"
+    return DocumentStream(name=name, stream=BytesIO(source.data))
 class ExportType(str, Enum):
     """Enumeration of available export types."""
@@ -39,6 +67,15 @@ class BaseMetaExtractor(ABC):
         """Extract Docling document meta."""
         raise NotImplementedError()
+    def to_dict(self) -> dict[str, Any]:
+        """Serialize to a dictionary."""
+        return {}
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "BaseMetaExtractor":  # noqa: ARG003
+        """Deserialize from a dictionary."""
+        return cls()
 class MetaExtractor(BaseMetaExtractor):
     """MetaExtractor."""
@@ -99,6 +136,53 @@ class DoclingConverter:
             self._chunker_instance = chunker or HybridChunker()
         self._meta_extractor_instance = meta_extractor or MetaExtractor()
+    def to_dict(self) -> dict[str, Any]:
+        """Serialize this component to a dictionary."""
+        if self.converter is not None:
+            logger.warning(
+                "DoclingConverter.to_dict: the 'converter' parameter cannot be serialized and will be dropped. "
+                "The component will use the default DocumentConverter when restored from the serialized form."
+            )
+        if self.chunker is not None:
+            logger.warning(
+                "DoclingConverter.to_dict: the 'chunker' parameter cannot be serialized and will be dropped. "
+                "The component will use the default chunker when restored from the serialized form."
+            )
+        meta_extractor_data = None
+        if self.meta_extractor is not None:
+            meta_extractor_data = serialize_class_instance(self.meta_extractor)
+        return default_to_dict(
+            self,
+            converter=None,
+            convert_kwargs=self.convert_kwargs,
+            export_type=self.export_type.value,
+            md_export_kwargs=self.md_export_kwargs,
+            chunker=None,
+            meta_extractor=meta_extractor_data,
+        )
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "DoclingConverter":
+        """
+        Deserialize this component from a dictionary.
+        The `converter` and `chunker` parameters are not serializable and are always ignored during
+        deserialization; the restored instance will use the default `DocumentConverter` and `HybridChunker`
+        respectively.
+        :param data: Dictionary with keys `type` and `init_parameters`, as produced by `to_dict`.
+        :returns: A new `DoclingConverter` instance.
+        """
+        init_params = data.get("init_parameters", {})
+        meta_extractor_data = init_params.get("meta_extractor")
+        if meta_extractor_data is not None:
+            init_params["meta_extractor"] = deserialize_class_instance(meta_extractor_data)
+        return default_from_dict(cls, data)
     @component.output_types(documents=list[Document])
     def run(
         self,
@@ -141,14 +225,8 @@ class DoclingConverter:
         documents: list[Document] = []
         for source, source_meta in zip(sources, meta_list, strict=True):
             if isinstance(source, ByteStream):
-                # docling requires a file path; write ByteStream data to a temp file
-                with tempfile.NamedTemporaryFile(delete=False) as tmp:
-                    tmp.write(source.data)
-                    tmp_path = Path(tmp.name)
-                try:
-                    dl_doc = self._converter_instance.convert(source=tmp_path, **self.convert_kwargs).document
-                finally:
-                    os.unlink(tmp_path)
+                doc_stream = _bytestream_to_document_stream(source)
+                dl_doc = self._converter_instance.convert(source=doc_stream, **self.convert_kwargs).document
                 # merge ByteStream meta (e.g. file_path, mime_type) with user-supplied meta
                 merged_meta = {**(source.meta or {}), **source_meta}
             else:

{docling_haystack-0.2.0 → docling_haystack-0.4.0}/tests/test_converter.py RENAMED Viewed

@@ -1,14 +1,23 @@
 import json
+import mimetypes
 import warnings
+from io import BytesIO
 from types import SimpleNamespace
 from typing import Any
-from unittest.mock import MagicMock, patch
+from unittest.mock import MagicMock
 import pytest
-from haystack.core.serialization import component_from_dict, component_to_dict
+from docling.chunking import HybridChunker
+from docling.document_converter import DocumentConverter
+from docling_core.types.io import DocumentStream
 from haystack.dataclasses import ByteStream
-from haystack_integrations.components.converters.docling import DoclingConverter, ExportType
+from haystack_integrations.components.converters.docling import (
+    DoclingConverter,
+    ExportType,
+    MetaExtractor,
+)
+from haystack_integrations.components.converters.docling.converter import _bytestream_to_document_stream
 def test_run_doc_chunks_minimal() -> None:
@@ -126,8 +135,6 @@ def test_run_json_minimal() -> None:
 def test_legacy_import_path() -> None:
-    import warnings
     with warnings.catch_warnings(record=True) as caught:
         warnings.simplefilter("always")
         from docling_haystack.converter import DoclingConverter as LegacyDoclingConverter
@@ -138,63 +145,59 @@ def test_legacy_import_path() -> None:
     )
-def test_component_from_dict_legacy_nulls() -> None:
-    # Before the public-attribute refactor, default serialization couldn't find
-    # the _-prefixed attributes and fell back to the init defaults, so
-    # convert_kwargs and md_export_kwargs were always serialized as null.
-    # Verify that such a serialized dict still deserializes correctly.
-    legacy_data = {
+def test_component_to_dict_defaults() -> None:
+    converter = DoclingConverter()
+    assert converter.to_dict() == {
         "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
         "init_parameters": {
             "converter": None,
-            "convert_kwargs": None,
+            "convert_kwargs": {},
             "export_type": "doc_chunks",
-            "md_export_kwargs": None,
+            "md_export_kwargs": {"image_placeholder": ""},
             "chunker": None,
             "meta_extractor": None,
         },
     }
-    restored = component_from_dict(DoclingConverter, legacy_data, "docling_converter")
-    assert restored.convert_kwargs == {}
-    assert restored.md_export_kwargs == {"image_placeholder": ""}
-    assert restored.export_type == ExportType.DOC_CHUNKS
-    assert restored.converter is None
-    assert restored.chunker is None
-    assert restored.meta_extractor is None
-def test_component_to_dict_defaults() -> None:
-    converter = DoclingConverter()
-    data = component_to_dict(converter, "docling_converter")
-    init_params = data["init_parameters"]
-    assert init_params["converter"] is None
-    assert init_params["convert_kwargs"] == {}
-    assert init_params["export_type"] == ExportType.DOC_CHUNKS
-    assert init_params["md_export_kwargs"] == {"image_placeholder": ""}
-    assert init_params["chunker"] is None
-    assert init_params["meta_extractor"] is None
 def test_component_to_dict_custom_params() -> None:
     converter = DoclingConverter(
+        converter=DocumentConverter(),
         convert_kwargs={"raises_on_error": False},
         export_type=ExportType.MARKDOWN,
         md_export_kwargs={"image_placeholder": "[img]"},
+        meta_extractor=MetaExtractor(),
     )
-    data = component_to_dict(converter, "docling_converter")
-    init_params = data["init_parameters"]
-    assert init_params["convert_kwargs"] == {"raises_on_error": False}
-    assert init_params["export_type"] == ExportType.MARKDOWN
-    assert init_params["md_export_kwargs"] == {"image_placeholder": "[img]"}
+    assert converter.to_dict() == {
+        "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
+        "init_parameters": {
+            "converter": None,
+            "convert_kwargs": {"raises_on_error": False},
+            "export_type": "markdown",
+            "md_export_kwargs": {"image_placeholder": "[img]"},
+            "chunker": None,
+            "meta_extractor": {
+                "type": "haystack_integrations.components.converters.docling.converter.MetaExtractor",
+                "data": {},
+            },
+        },
+    }
 def test_component_from_dict_defaults() -> None:
-    converter = DoclingConverter()
-    data = component_to_dict(converter, "docling_converter")
-    restored = component_from_dict(DoclingConverter, data, "docling_converter")
+    # null kwargs mirror the pre-refactor serialization format and must still deserialize correctly
+    data = {
+        "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
+        "init_parameters": {
+            "converter": None,
+            "convert_kwargs": None,
+            "export_type": "doc_chunks",
+            "md_export_kwargs": None,
+            "chunker": None,
+            "meta_extractor": None,
+        },
+    }
+    restored = DoclingConverter.from_dict(data)
     assert restored.converter is None
     assert restored.convert_kwargs == {}
@@ -205,17 +208,44 @@ def test_component_from_dict_defaults() -> None:
 def test_component_from_dict_custom_params() -> None:
-    converter = DoclingConverter(
-        convert_kwargs={"raises_on_error": False},
-        export_type=ExportType.JSON,
-        md_export_kwargs={"image_placeholder": "[img]"},
-    )
-    data = component_to_dict(converter, "docling_converter")
-    restored = component_from_dict(DoclingConverter, data, "docling_converter")
+    data = {
+        "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
+        "init_parameters": {
+            "converter": None,
+            "convert_kwargs": {"raises_on_error": False},
+            "export_type": "json",
+            "md_export_kwargs": {"image_placeholder": "[img]"},
+            "chunker": None,
+            "meta_extractor": {
+                "type": "haystack_integrations.components.converters.docling.converter.MetaExtractor",
+                "data": {},
+            },
+        },
+    }
+    restored = DoclingConverter.from_dict(data)
+    assert restored.converter is None
     assert restored.convert_kwargs == {"raises_on_error": False}
     assert restored.export_type == ExportType.JSON
     assert restored.md_export_kwargs == {"image_placeholder": "[img]"}
+    assert restored.chunker is None
+    assert isinstance(restored.meta_extractor, MetaExtractor)
+def test_component_to_dict_chunker_warns_and_is_dropped() -> None:
+    converter = DoclingConverter(chunker=HybridChunker(merge_peers=False))
+    assert converter.to_dict() == {
+        "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
+        "init_parameters": {
+            "converter": None,
+            "convert_kwargs": {},
+            "export_type": "doc_chunks",
+            "md_export_kwargs": {"image_placeholder": ""},
+            "chunker": None,
+            "meta_extractor": None,
+        },
+    }
 def test_run_with_sources_parameter() -> None:
@@ -356,13 +386,129 @@ def test_run_with_bytestream_source() -> None:
     bytestream = ByteStream(data=b"%PDF-1.4 fake pdf content", meta={"file_path": "uploaded.pdf"})
-    with patch("os.unlink"):
-        result = converter.run(sources=[bytestream])
+    result = converter.run(sources=[bytestream])
     documents = result["documents"]
     assert len(documents) == 1
     # ByteStream meta is merged into the output document
     assert documents[0].meta["file_path"] == "uploaded.pdf"
-    # docling was called with a temp file path, not the ByteStream directly
+    # docling was called with a DocumentStream, not a temp file path
     call_args = converter_mock.convert.call_args
-    assert call_args.kwargs["source"] != bytestream
+    passed_source = call_args.kwargs["source"]
+    assert isinstance(passed_source, DocumentStream)
+    assert passed_source.name == "uploaded.pdf"
+    assert isinstance(passed_source.stream, BytesIO)
+class TestBytestreamToDocumentStream:
+    def test_uses_file_path(self) -> None:
+        bs = ByteStream(data=b"data", meta={"file_path": "report.pdf"})
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == "report.pdf"
+        assert ds.stream.read() == b"data"
+    def test_strips_directory_from_file_path(self) -> None:
+        bs = ByteStream(data=b"data", meta={"file_path": "/some/deep/path/report.pdf"})
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == "report.pdf"
+    def test_uses_file_name_key(self) -> None:
+        bs = ByteStream(data=b"data", meta={"file_name": "slide-deck.pptx"})
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == "slide-deck.pptx"
+    def test_uses_name_key(self) -> None:
+        bs = ByteStream(data=b"data", meta={"name": "notes.docx"})
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == "notes.docx"
+    def test_file_path_takes_priority_over_file_name(self) -> None:
+        bs = ByteStream(data=b"data", meta={"file_path": "real.pdf", "file_name": "other.pdf"})
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == "real.pdf"
+    def test_file_name_takes_priority_over_name(self) -> None:
+        bs = ByteStream(data=b"data", meta={"file_name": "chosen.pdf", "name": "ignored.pdf"})
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == "chosen.pdf"
+    def test_guesses_extension_from_mime_type(self) -> None:
+        mime = "application/pdf"
+        expected_ext = mimetypes.guess_extension(mime)
+        bs = ByteStream(data=b"data", meta={"file_path": "report"}, mime_type=mime)
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == f"report{expected_ext}"
+    def test_keeps_extension_when_present(self) -> None:
+        # mime_type should not override an already-present extension
+        bs = ByteStream(data=b"data", meta={"file_path": "report.pdf"}, mime_type="text/plain")
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == "report.pdf"
+    def test_no_meta_no_mime_type(self) -> None:
+        bs = ByteStream(data=b"data")
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == "document"
+    def test_no_meta_with_mime_type(self) -> None:
+        mime = "application/pdf"
+        expected_ext = mimetypes.guess_extension(mime)
+        bs = ByteStream(data=b"data", mime_type=mime)
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == f"document{expected_ext}"
+    def test_empty_meta_no_mime_type(self) -> None:
+        bs = ByteStream(data=b"data", meta={})
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == "document"
+    def test_returns_document_stream_with_bytesio(self) -> None:
+        bs = ByteStream(data=b"hello", meta={"file_path": "f.pdf"})
+        ds = _bytestream_to_document_stream(bs)
+        assert isinstance(ds, DocumentStream)
+        assert isinstance(ds.stream, BytesIO)
+    def test_unknown_mime_type_keeps_base_name(self) -> None:
+        # mimetypes.guess_extension returns None for unknown types, so the name stays as-is.
+        assert mimetypes.guess_extension("application/x-totally-made-up-type") is None
+        bs = ByteStream(
+            data=b"data",
+            meta={"file_path": "report"},
+            mime_type="application/x-totally-made-up-type",
+        )
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == "report"
+class TestMetaExtractor:
+    def test_extract_chunk_meta_wraps_export_json_dict(self) -> None:
+        chunk = MagicMock()
+        chunk.export_json_dict.return_value = {"some": "dict"}
+        result = MetaExtractor().extract_chunk_meta(chunk=chunk)
+        assert result == {"dl_meta": {"some": "dict"}}
+        chunk.export_json_dict.assert_called_once_with()
+    def test_extract_dl_doc_meta_with_origin(self) -> None:
+        dl_doc = MagicMock()
+        dl_doc.origin.model_dump.return_value = {"filename": "foo.pdf", "mimetype": "application/pdf"}
+        result = MetaExtractor().extract_dl_doc_meta(dl_doc=dl_doc)
+        assert result == {"dl_meta": {"origin": {"filename": "foo.pdf", "mimetype": "application/pdf"}}}
+        dl_doc.origin.model_dump.assert_called_once_with(exclude_none=True)
+    def test_extract_dl_doc_meta_without_origin(self) -> None:
+        dl_doc = MagicMock()
+        dl_doc.origin = None
+        result = MetaExtractor().extract_dl_doc_meta(dl_doc=dl_doc)
+        assert result == {}
+def test_run_without_sources_or_paths_raises_value_error() -> None:
+    converter = DoclingConverter(converter=MagicMock(), meta_extractor=MagicMock())
+    with pytest.raises(ValueError, match=r"Either 'sources' or the deprecated 'paths' parameter must be provided."):
+        converter.run()