PyPI - docling-haystack - Versions diffs - 0.2.0__tar.gz → 0.3.0__tar.gz - Mend

docling-haystack 0.2.0.tar.gz → 0.3.0.tar.gz

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (14) hide show

docling_haystack-0.3.0/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,13 @@
+# Changelog
+## [integrations/docling-v0.2.0] - 2026-04-08
+### 🚀 Features
+- Add Docling document converter (#3066)
+### 🚜 Refactor
+- *(docling)* Add meta parameter to run(); introduce sources; deprecate paths (#3103)
+<!-- generated by git-cliff -->

{docling_haystack-0.2.0 → docling_haystack-0.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling-haystack
-Version: 0.2.0
+Version: 0.3.0
 Summary: Haystack integration for docling
 Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/docling#readme
 Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues

{docling_haystack-0.2.0 → docling_haystack-0.3.0}/src/haystack_integrations/components/converters/docling/converter.py RENAMED Viewed

@@ -1,14 +1,15 @@
 """Docling Haystack converter module."""
 import json
-import os
-import tempfile
+import mimetypes
 import warnings
 from abc import ABC, abstractmethod
 from enum import Enum
+from io import BytesIO
 from pathlib import Path
 from typing import Any
+from docling_core.types.io import DocumentStream
 from haystack import Document, component
 from haystack.components.converters.utils import normalize_metadata
 from haystack.dataclasses import ByteStream
@@ -18,6 +19,29 @@ from docling.datamodel.document import DoclingDocument
 from docling.document_converter import DocumentConverter
+def _bytestream_to_document_stream(source: ByteStream) -> DocumentStream:
+    """
+    Build a `DocumentStream` from a Haystack `ByteStream`.
+    Resolves the stream name by checking common metadata keys (`file_path`, `file_name`, `name`) and falling back to
+    MIME-type extension guessing so that docling can reliably detect the input format.
+    """
+    meta = source.meta or {}
+    raw_name = meta.get("file_path") or meta.get("file_name") or meta.get("name")
+    if raw_name:
+        name = Path(raw_name).name
+    else:
+        name = "document"
+    if not Path(name).suffix and source.mime_type:
+        ext = mimetypes.guess_extension(source.mime_type)
+        if ext:
+            name = f"{name}{ext}"
+    return DocumentStream(name=name, stream=BytesIO(source.data))
 class ExportType(str, Enum):
     """Enumeration of available export types."""
@@ -141,14 +165,8 @@ class DoclingConverter:
         documents: list[Document] = []
         for source, source_meta in zip(sources, meta_list, strict=True):
             if isinstance(source, ByteStream):
-                # docling requires a file path; write ByteStream data to a temp file
-                with tempfile.NamedTemporaryFile(delete=False) as tmp:
-                    tmp.write(source.data)
-                    tmp_path = Path(tmp.name)
-                try:
-                    dl_doc = self._converter_instance.convert(source=tmp_path, **self.convert_kwargs).document
-                finally:
-                    os.unlink(tmp_path)
+                doc_stream = _bytestream_to_document_stream(source)
+                dl_doc = self._converter_instance.convert(source=doc_stream, **self.convert_kwargs).document
                 # merge ByteStream meta (e.g. file_path, mime_type) with user-supplied meta
                 merged_meta = {**(source.meta or {}), **source_meta}
             else:

{docling_haystack-0.2.0 → docling_haystack-0.3.0}/tests/test_converter.py RENAMED Viewed

@@ -1,14 +1,18 @@
 import json
+import mimetypes
 import warnings
+from io import BytesIO
 from types import SimpleNamespace
 from typing import Any
-from unittest.mock import MagicMock, patch
+from unittest.mock import MagicMock
 import pytest
+from docling_core.types.io import DocumentStream
 from haystack.core.serialization import component_from_dict, component_to_dict
 from haystack.dataclasses import ByteStream
 from haystack_integrations.components.converters.docling import DoclingConverter, ExportType
+from haystack_integrations.components.converters.docling.converter import _bytestream_to_document_stream
 def test_run_doc_chunks_minimal() -> None:
@@ -356,13 +360,84 @@ def test_run_with_bytestream_source() -> None:
     bytestream = ByteStream(data=b"%PDF-1.4 fake pdf content", meta={"file_path": "uploaded.pdf"})
-    with patch("os.unlink"):
-        result = converter.run(sources=[bytestream])
+    result = converter.run(sources=[bytestream])
     documents = result["documents"]
     assert len(documents) == 1
     # ByteStream meta is merged into the output document
     assert documents[0].meta["file_path"] == "uploaded.pdf"
-    # docling was called with a temp file path, not the ByteStream directly
+    # docling was called with a DocumentStream, not a temp file path
     call_args = converter_mock.convert.call_args
-    assert call_args.kwargs["source"] != bytestream
+    passed_source = call_args.kwargs["source"]
+    assert isinstance(passed_source, DocumentStream)
+    assert passed_source.name == "uploaded.pdf"
+    assert isinstance(passed_source.stream, BytesIO)
+class TestBytestreamToDocumentStream:
+    def test_uses_file_path(self) -> None:
+        bs = ByteStream(data=b"data", meta={"file_path": "report.pdf"})
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == "report.pdf"
+        assert ds.stream.read() == b"data"
+    def test_strips_directory_from_file_path(self) -> None:
+        bs = ByteStream(data=b"data", meta={"file_path": "/some/deep/path/report.pdf"})
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == "report.pdf"
+    def test_uses_file_name_key(self) -> None:
+        bs = ByteStream(data=b"data", meta={"file_name": "slide-deck.pptx"})
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == "slide-deck.pptx"
+    def test_uses_name_key(self) -> None:
+        bs = ByteStream(data=b"data", meta={"name": "notes.docx"})
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == "notes.docx"
+    def test_file_path_takes_priority_over_file_name(self) -> None:
+        bs = ByteStream(data=b"data", meta={"file_path": "real.pdf", "file_name": "other.pdf"})
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == "real.pdf"
+    def test_file_name_takes_priority_over_name(self) -> None:
+        bs = ByteStream(data=b"data", meta={"file_name": "chosen.pdf", "name": "ignored.pdf"})
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == "chosen.pdf"
+    def test_guesses_extension_from_mime_type(self) -> None:
+        mime = "application/pdf"
+        expected_ext = mimetypes.guess_extension(mime)
+        bs = ByteStream(data=b"data", meta={"file_path": "report"}, mime_type=mime)
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == f"report{expected_ext}"
+    def test_keeps_extension_when_present(self) -> None:
+        # mime_type should not override an already-present extension
+        bs = ByteStream(data=b"data", meta={"file_path": "report.pdf"}, mime_type="text/plain")
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == "report.pdf"
+    def test_no_meta_no_mime_type(self) -> None:
+        bs = ByteStream(data=b"data")
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == "document"
+    def test_no_meta_with_mime_type(self) -> None:
+        mime = "application/pdf"
+        expected_ext = mimetypes.guess_extension(mime)
+        bs = ByteStream(data=b"data", mime_type=mime)
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == f"document{expected_ext}"
+    def test_empty_meta_no_mime_type(self) -> None:
+        bs = ByteStream(data=b"data", meta={})
+        ds = _bytestream_to_document_stream(bs)
+        assert ds.name == "document"
+    def test_returns_document_stream_with_bytesio(self) -> None:
+        bs = ByteStream(data=b"hello", meta={"file_path": "f.pdf"})
+        ds = _bytestream_to_document_stream(bs)
+        assert isinstance(ds, DocumentStream)
+        assert isinstance(ds.stream, BytesIO)