docling-haystack 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ # Changelog
2
+
3
+ ## [integrations/docling-v0.2.0] - 2026-04-08
4
+
5
+ ### 🚀 Features
6
+
7
+ - Add Docling document converter (#3066)
8
+
9
+ ### 🚜 Refactor
10
+
11
+ - *(docling)* Add meta parameter to run(); introduce sources; deprecate paths (#3103)
12
+
13
+ <!-- generated by git-cliff -->
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-haystack
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Haystack integration for docling
5
5
  Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/docling#readme
6
6
  Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
@@ -1,14 +1,15 @@
1
1
  """Docling Haystack converter module."""
2
2
 
3
3
  import json
4
- import os
5
- import tempfile
4
+ import mimetypes
6
5
  import warnings
7
6
  from abc import ABC, abstractmethod
8
7
  from enum import Enum
8
+ from io import BytesIO
9
9
  from pathlib import Path
10
10
  from typing import Any
11
11
 
12
+ from docling_core.types.io import DocumentStream
12
13
  from haystack import Document, component
13
14
  from haystack.components.converters.utils import normalize_metadata
14
15
  from haystack.dataclasses import ByteStream
@@ -18,6 +19,29 @@ from docling.datamodel.document import DoclingDocument
18
19
  from docling.document_converter import DocumentConverter
19
20
 
20
21
 
22
def _bytestream_to_document_stream(source: ByteStream) -> DocumentStream:
    """
    Build a `DocumentStream` from a Haystack `ByteStream`.

    The stream name is taken from the first non-empty metadata key among `file_path`, `file_name` and `name`;
    only the final path component is kept. If none of the keys is set, or the value has no final component
    (e.g. `"/"` or `"."`), the name falls back to `"document"`. When the resulting name has no suffix and the
    `ByteStream` carries a MIME type, an extension guessed from that MIME type is appended so that docling can
    reliably detect the input format.

    :param source: The `ByteStream` to wrap.
    :returns: A `DocumentStream` named as described above whose `stream` is a new `BytesIO` over `source.data`.
    """
    meta = source.meta or {}
    raw_name = meta.get("file_path") or meta.get("file_name") or meta.get("name")

    # `Path("/").name` and `Path(".").name` are empty strings. Without the fallback the stream would end up
    # named "" or, after extension guessing, ".pdf" -- which pathlib treats as a dotfile with no suffix.
    name = (Path(raw_name).name if raw_name else "") or "document"

    if not Path(name).suffix and source.mime_type:
        ext = mimetypes.guess_extension(source.mime_type)
        if ext:
            name = f"{name}{ext}"

    return DocumentStream(name=name, stream=BytesIO(source.data))
43
+
44
+
21
45
  class ExportType(str, Enum):
22
46
  """Enumeration of available export types."""
23
47
 
@@ -141,14 +165,8 @@ class DoclingConverter:
141
165
  documents: list[Document] = []
142
166
  for source, source_meta in zip(sources, meta_list, strict=True):
143
167
  if isinstance(source, ByteStream):
144
- # docling requires a file path; write ByteStream data to a temp file
145
- with tempfile.NamedTemporaryFile(delete=False) as tmp:
146
- tmp.write(source.data)
147
- tmp_path = Path(tmp.name)
148
- try:
149
- dl_doc = self._converter_instance.convert(source=tmp_path, **self.convert_kwargs).document
150
- finally:
151
- os.unlink(tmp_path)
168
+ doc_stream = _bytestream_to_document_stream(source)
169
+ dl_doc = self._converter_instance.convert(source=doc_stream, **self.convert_kwargs).document
152
170
  # merge ByteStream meta (e.g. file_path, mime_type) with user-supplied meta
153
171
  merged_meta = {**(source.meta or {}), **source_meta}
154
172
  else:
@@ -1,14 +1,18 @@
1
1
  import json
2
+ import mimetypes
2
3
  import warnings
4
+ from io import BytesIO
3
5
  from types import SimpleNamespace
4
6
  from typing import Any
5
- from unittest.mock import MagicMock, patch
7
+ from unittest.mock import MagicMock
6
8
 
7
9
  import pytest
10
+ from docling_core.types.io import DocumentStream
8
11
  from haystack.core.serialization import component_from_dict, component_to_dict
9
12
  from haystack.dataclasses import ByteStream
10
13
 
11
14
  from haystack_integrations.components.converters.docling import DoclingConverter, ExportType
15
+ from haystack_integrations.components.converters.docling.converter import _bytestream_to_document_stream
12
16
 
13
17
 
14
18
  def test_run_doc_chunks_minimal() -> None:
@@ -356,13 +360,84 @@ def test_run_with_bytestream_source() -> None:
356
360
 
357
361
  bytestream = ByteStream(data=b"%PDF-1.4 fake pdf content", meta={"file_path": "uploaded.pdf"})
358
362
 
359
- with patch("os.unlink"):
360
- result = converter.run(sources=[bytestream])
363
+ result = converter.run(sources=[bytestream])
361
364
 
362
365
  documents = result["documents"]
363
366
  assert len(documents) == 1
364
367
  # ByteStream meta is merged into the output document
365
368
  assert documents[0].meta["file_path"] == "uploaded.pdf"
366
- # docling was called with a temp file path, not the ByteStream directly
369
+ # docling was called with a DocumentStream, not a temp file path
367
370
  call_args = converter_mock.convert.call_args
368
- assert call_args.kwargs["source"] != bytestream
371
+ passed_source = call_args.kwargs["source"]
372
+ assert isinstance(passed_source, DocumentStream)
373
+ assert passed_source.name == "uploaded.pdf"
374
+ assert isinstance(passed_source.stream, BytesIO)
375
+
376
+
377
class TestBytestreamToDocumentStream:
    """Tests for how `_bytestream_to_document_stream` names and wraps a `ByteStream`."""

    def test_uses_file_path(self) -> None:
        """The `file_path` meta value becomes the stream name and the payload is readable."""
        converted = _bytestream_to_document_stream(ByteStream(data=b"data", meta={"file_path": "report.pdf"}))
        assert converted.name == "report.pdf"
        assert converted.stream.read() == b"data"

    def test_strips_directory_from_file_path(self) -> None:
        """Only the final path component of `file_path` is used as the name."""
        source = ByteStream(data=b"data", meta={"file_path": "/some/deep/path/report.pdf"})
        assert _bytestream_to_document_stream(source).name == "report.pdf"

    def test_uses_file_name_key(self) -> None:
        """The `file_name` meta key is honoured when `file_path` is absent."""
        source = ByteStream(data=b"data", meta={"file_name": "slide-deck.pptx"})
        assert _bytestream_to_document_stream(source).name == "slide-deck.pptx"

    def test_uses_name_key(self) -> None:
        """The `name` meta key is honoured when neither `file_path` nor `file_name` is set."""
        source = ByteStream(data=b"data", meta={"name": "notes.docx"})
        assert _bytestream_to_document_stream(source).name == "notes.docx"

    def test_file_path_takes_priority_over_file_name(self) -> None:
        """`file_path` wins when both `file_path` and `file_name` are present."""
        source_meta = {"file_path": "real.pdf", "file_name": "other.pdf"}
        converted = _bytestream_to_document_stream(ByteStream(data=b"data", meta=source_meta))
        assert converted.name == "real.pdf"

    def test_file_name_takes_priority_over_name(self) -> None:
        """`file_name` wins when both `file_name` and `name` are present."""
        source_meta = {"file_name": "chosen.pdf", "name": "ignored.pdf"}
        converted = _bytestream_to_document_stream(ByteStream(data=b"data", meta=source_meta))
        assert converted.name == "chosen.pdf"

    def test_guesses_extension_from_mime_type(self) -> None:
        """A suffix-less name gets the extension that `mimetypes` maps to the MIME type."""
        mime_type = "application/pdf"
        guessed = mimetypes.guess_extension(mime_type)
        source = ByteStream(data=b"data", meta={"file_path": "report"}, mime_type=mime_type)
        converted = _bytestream_to_document_stream(source)
        assert converted.name == f"report{guessed}"

    def test_keeps_extension_when_present(self) -> None:
        """An existing suffix is left untouched even if the MIME type disagrees."""
        # mime_type should not override an already-present extension
        source = ByteStream(data=b"data", meta={"file_path": "report.pdf"}, mime_type="text/plain")
        assert _bytestream_to_document_stream(source).name == "report.pdf"

    def test_no_meta_no_mime_type(self) -> None:
        """Without meta and MIME type the default name is used."""
        converted = _bytestream_to_document_stream(ByteStream(data=b"data"))
        assert converted.name == "document"

    def test_no_meta_with_mime_type(self) -> None:
        """Without meta the default name is extended with the guessed extension."""
        mime_type = "application/pdf"
        guessed = mimetypes.guess_extension(mime_type)
        converted = _bytestream_to_document_stream(ByteStream(data=b"data", mime_type=mime_type))
        assert converted.name == f"document{guessed}"

    def test_empty_meta_no_mime_type(self) -> None:
        """An explicitly empty meta dict behaves like missing meta."""
        converted = _bytestream_to_document_stream(ByteStream(data=b"data", meta={}))
        assert converted.name == "document"

    def test_returns_document_stream_with_bytesio(self) -> None:
        """The result is a `DocumentStream` whose `stream` is a `BytesIO`."""
        converted = _bytestream_to_document_stream(ByteStream(data=b"hello", meta={"file_path": "f.pdf"}))
        assert isinstance(converted, DocumentStream)
        assert isinstance(converted.stream, BytesIO)