docling-haystack 0.2.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,20 @@
1
+ # Changelog
2
+
3
+ ## [integrations/docling-v0.3.0] - 2026-04-10
4
+
5
+ ### 🚀 Features
6
+
7
+ - *(docling)* Drop temp files for ByteStream sources (#3130)
8
+
9
+
10
+ ## [integrations/docling-v0.2.0] - 2026-04-08
11
+
12
+ ### 🚀 Features
13
+
14
+ - Add Docling document converter (#3066)
15
+
16
+ ### 🚜 Refactor
17
+
18
+ - *(docling)* Add meta parameter to run(); introduce sources; deprecate paths (#3103)
19
+
20
+ <!-- generated by git-cliff -->
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-haystack
3
- Version: 0.2.0
3
+ Version: 0.4.0
4
4
  Summary: Haystack integration for docling
5
5
  Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/docling#readme
6
6
  Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
@@ -1,22 +1,50 @@
1
1
  """Docling Haystack converter module."""
2
2
 
3
3
  import json
4
- import os
5
- import tempfile
4
+ import mimetypes
6
5
  import warnings
7
6
  from abc import ABC, abstractmethod
8
7
  from enum import Enum
8
+ from io import BytesIO
9
9
  from pathlib import Path
10
10
  from typing import Any
11
11
 
12
- from haystack import Document, component
12
+ from docling_core.types.io import DocumentStream
13
+ from haystack import Document, component, logging
13
14
  from haystack.components.converters.utils import normalize_metadata
15
+ from haystack.core.serialization import default_from_dict, default_to_dict
14
16
  from haystack.dataclasses import ByteStream
17
+ from haystack.utils.base_serialization import deserialize_class_instance, serialize_class_instance
15
18
 
16
19
  from docling.chunking import BaseChunk, BaseChunker, HybridChunker
17
20
  from docling.datamodel.document import DoclingDocument
18
21
  from docling.document_converter import DocumentConverter
19
22
 
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
def _bytestream_to_document_stream(source: ByteStream) -> DocumentStream:
    """
    Build a `DocumentStream` from a Haystack `ByteStream`.

    The stream name is resolved from the first truthy metadata key among `file_path`, `file_name` and `name`
    (in that order); only the final path component is kept. When no usable name is found, the generic name
    `document` is used. If the resulting name has no suffix and the `ByteStream` carries a MIME type, the
    extension guessed by `mimetypes.guess_extension` is appended so that docling can detect the input format.

    :param source: The `ByteStream` whose `data`, `meta` and `mime_type` are read.
    :returns: A `DocumentStream` wrapping a fresh `BytesIO` over `source.data`.
    """
    meta = source.meta or {}
    raw_name = meta.get("file_path") or meta.get("file_name") or meta.get("name")

    # str() lets non-string meta values (e.g. Path objects) through unchanged in meaning.
    # A path without a final component ("/" or ".") yields an empty basename, which would
    # produce an empty or extension-only stream name, so fall back to the generic name.
    name = Path(str(raw_name)).name if raw_name else ""
    if not name:
        name = "document"

    # Only guess an extension when the name does not already carry one.
    if not Path(name).suffix and source.mime_type:
        ext = mimetypes.guess_extension(source.mime_type)
        if ext:  # guess_extension returns None for unknown MIME types
            name = f"{name}{ext}"

    return DocumentStream(name=name, stream=BytesIO(source.data))
47
+
20
48
 
21
49
  class ExportType(str, Enum):
22
50
  """Enumeration of available export types."""
@@ -39,6 +67,15 @@ class BaseMetaExtractor(ABC):
39
67
  """Extract Docling document meta."""
40
68
  raise NotImplementedError()
41
69
 
70
def to_dict(self) -> dict[str, Any]:
    """Return the serialized form of this extractor: a new, empty dictionary."""
    serialized: dict[str, Any] = {}
    return serialized
73
+
74
@classmethod
def from_dict(cls, data: dict[str, Any]) -> "BaseMetaExtractor":  # noqa: ARG003
    """Build an instance from serialized data; `data` is not read and `cls` is called without arguments."""
    instance = cls()
    return instance
78
+
42
79
 
43
80
  class MetaExtractor(BaseMetaExtractor):
44
81
  """MetaExtractor."""
@@ -99,6 +136,53 @@ class DoclingConverter:
99
136
  self._chunker_instance = chunker or HybridChunker()
100
137
  self._meta_extractor_instance = meta_extractor or MetaExtractor()
101
138
 
139
def to_dict(self) -> dict[str, Any]:
    """
    Serialize this component to a dictionary.

    `converter` and `chunker` are always written as `None`; a warning is logged for each of them that
    was set on this instance. `export_type` is stored as its enum value and `meta_extractor`, when set,
    is serialized with `serialize_class_instance`.

    :returns: The dictionary produced by `default_to_dict`.
    """
    # (attribute value, warning to emit when that attribute is set), checked in this order.
    unserializable = (
        (
            self.converter,
            "DoclingConverter.to_dict: the 'converter' parameter cannot be serialized and will be dropped. "
            "The component will use the default DocumentConverter when restored from the serialized form.",
        ),
        (
            self.chunker,
            "DoclingConverter.to_dict: the 'chunker' parameter cannot be serialized and will be dropped. "
            "The component will use the default chunker when restored from the serialized form.",
        ),
    )
    for value, message in unserializable:
        if value is not None:
            logger.warning(message)

    extractor = self.meta_extractor
    init_parameters = {
        "converter": None,
        "convert_kwargs": self.convert_kwargs,
        "export_type": self.export_type.value,
        "md_export_kwargs": self.md_export_kwargs,
        "chunker": None,
        "meta_extractor": None if extractor is None else serialize_class_instance(extractor),
    }
    return default_to_dict(self, **init_parameters)
165
+
166
@classmethod
def from_dict(cls, data: dict[str, Any]) -> "DoclingConverter":
    """
    Deserialize this component from a dictionary.

    The `converter` and `chunker` parameters are not serializable and are always ignored during
    deserialization; the restored instance will use the default `DocumentConverter` and `HybridChunker`
    respectively.

    The input dictionary is not modified, so the same `data` can be deserialized more than once.

    :param data: Dictionary with keys `type` and `init_parameters`, as produced by `to_dict`.
    :returns: A new `DoclingConverter` instance.
    """
    # Copy before rewriting: replacing the serialized meta extractor in the caller's dictionary
    # with a live instance would make a second from_dict() call on the same data fail.
    init_params = dict(data.get("init_parameters") or {})

    meta_extractor_data = init_params.get("meta_extractor")
    if meta_extractor_data is not None:
        init_params["meta_extractor"] = deserialize_class_instance(meta_extractor_data)

    return default_from_dict(cls, {**data, "init_parameters": init_params})
185
+
102
186
  @component.output_types(documents=list[Document])
103
187
  def run(
104
188
  self,
@@ -141,14 +225,8 @@ class DoclingConverter:
141
225
  documents: list[Document] = []
142
226
  for source, source_meta in zip(sources, meta_list, strict=True):
143
227
  if isinstance(source, ByteStream):
144
- # docling requires a file path; write ByteStream data to a temp file
145
- with tempfile.NamedTemporaryFile(delete=False) as tmp:
146
- tmp.write(source.data)
147
- tmp_path = Path(tmp.name)
148
- try:
149
- dl_doc = self._converter_instance.convert(source=tmp_path, **self.convert_kwargs).document
150
- finally:
151
- os.unlink(tmp_path)
228
+ doc_stream = _bytestream_to_document_stream(source)
229
+ dl_doc = self._converter_instance.convert(source=doc_stream, **self.convert_kwargs).document
152
230
  # merge ByteStream meta (e.g. file_path, mime_type) with user-supplied meta
153
231
  merged_meta = {**(source.meta or {}), **source_meta}
154
232
  else:
@@ -1,14 +1,23 @@
1
1
  import json
2
+ import mimetypes
2
3
  import warnings
4
+ from io import BytesIO
3
5
  from types import SimpleNamespace
4
6
  from typing import Any
5
- from unittest.mock import MagicMock, patch
7
+ from unittest.mock import MagicMock
6
8
 
7
9
  import pytest
8
- from haystack.core.serialization import component_from_dict, component_to_dict
10
+ from docling.chunking import HybridChunker
11
+ from docling.document_converter import DocumentConverter
12
+ from docling_core.types.io import DocumentStream
9
13
  from haystack.dataclasses import ByteStream
10
14
 
11
- from haystack_integrations.components.converters.docling import DoclingConverter, ExportType
15
+ from haystack_integrations.components.converters.docling import (
16
+ DoclingConverter,
17
+ ExportType,
18
+ MetaExtractor,
19
+ )
20
+ from haystack_integrations.components.converters.docling.converter import _bytestream_to_document_stream
12
21
 
13
22
 
14
23
  def test_run_doc_chunks_minimal() -> None:
@@ -126,8 +135,6 @@ def test_run_json_minimal() -> None:
126
135
 
127
136
 
128
137
  def test_legacy_import_path() -> None:
129
- import warnings
130
-
131
138
  with warnings.catch_warnings(record=True) as caught:
132
139
  warnings.simplefilter("always")
133
140
  from docling_haystack.converter import DoclingConverter as LegacyDoclingConverter
@@ -138,63 +145,59 @@ def test_legacy_import_path() -> None:
138
145
  )
139
146
 
140
147
 
141
- def test_component_from_dict_legacy_nulls() -> None:
142
- # Before the public-attribute refactor, default serialization couldn't find
143
- # the _-prefixed attributes and fell back to the init defaults, so
144
- # convert_kwargs and md_export_kwargs were always serialized as null.
145
- # Verify that such a serialized dict still deserializes correctly.
146
- legacy_data = {
148
+ def test_component_to_dict_defaults() -> None:
149
+ converter = DoclingConverter()
150
+ assert converter.to_dict() == {
147
151
  "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
148
152
  "init_parameters": {
149
153
  "converter": None,
150
- "convert_kwargs": None,
154
+ "convert_kwargs": {},
151
155
  "export_type": "doc_chunks",
152
- "md_export_kwargs": None,
156
+ "md_export_kwargs": {"image_placeholder": ""},
153
157
  "chunker": None,
154
158
  "meta_extractor": None,
155
159
  },
156
160
  }
157
- restored = component_from_dict(DoclingConverter, legacy_data, "docling_converter")
158
-
159
- assert restored.convert_kwargs == {}
160
- assert restored.md_export_kwargs == {"image_placeholder": ""}
161
- assert restored.export_type == ExportType.DOC_CHUNKS
162
- assert restored.converter is None
163
- assert restored.chunker is None
164
- assert restored.meta_extractor is None
165
-
166
-
167
- def test_component_to_dict_defaults() -> None:
168
- converter = DoclingConverter()
169
- data = component_to_dict(converter, "docling_converter")
170
-
171
- init_params = data["init_parameters"]
172
- assert init_params["converter"] is None
173
- assert init_params["convert_kwargs"] == {}
174
- assert init_params["export_type"] == ExportType.DOC_CHUNKS
175
- assert init_params["md_export_kwargs"] == {"image_placeholder": ""}
176
- assert init_params["chunker"] is None
177
- assert init_params["meta_extractor"] is None
178
161
 
179
162
 
180
163
  def test_component_to_dict_custom_params() -> None:
181
164
  converter = DoclingConverter(
165
+ converter=DocumentConverter(),
182
166
  convert_kwargs={"raises_on_error": False},
183
167
  export_type=ExportType.MARKDOWN,
184
168
  md_export_kwargs={"image_placeholder": "[img]"},
169
+ meta_extractor=MetaExtractor(),
185
170
  )
186
- data = component_to_dict(converter, "docling_converter")
187
-
188
- init_params = data["init_parameters"]
189
- assert init_params["convert_kwargs"] == {"raises_on_error": False}
190
- assert init_params["export_type"] == ExportType.MARKDOWN
191
- assert init_params["md_export_kwargs"] == {"image_placeholder": "[img]"}
171
+ assert converter.to_dict() == {
172
+ "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
173
+ "init_parameters": {
174
+ "converter": None,
175
+ "convert_kwargs": {"raises_on_error": False},
176
+ "export_type": "markdown",
177
+ "md_export_kwargs": {"image_placeholder": "[img]"},
178
+ "chunker": None,
179
+ "meta_extractor": {
180
+ "type": "haystack_integrations.components.converters.docling.converter.MetaExtractor",
181
+ "data": {},
182
+ },
183
+ },
184
+ }
192
185
 
193
186
 
194
187
  def test_component_from_dict_defaults() -> None:
195
- converter = DoclingConverter()
196
- data = component_to_dict(converter, "docling_converter")
197
- restored = component_from_dict(DoclingConverter, data, "docling_converter")
188
+ # null kwargs mirror the pre-refactor serialization format and must still deserialize correctly
189
+ data = {
190
+ "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
191
+ "init_parameters": {
192
+ "converter": None,
193
+ "convert_kwargs": None,
194
+ "export_type": "doc_chunks",
195
+ "md_export_kwargs": None,
196
+ "chunker": None,
197
+ "meta_extractor": None,
198
+ },
199
+ }
200
+ restored = DoclingConverter.from_dict(data)
198
201
 
199
202
  assert restored.converter is None
200
203
  assert restored.convert_kwargs == {}
@@ -205,17 +208,44 @@ def test_component_from_dict_defaults() -> None:
205
208
 
206
209
 
207
210
  def test_component_from_dict_custom_params() -> None:
208
- converter = DoclingConverter(
209
- convert_kwargs={"raises_on_error": False},
210
- export_type=ExportType.JSON,
211
- md_export_kwargs={"image_placeholder": "[img]"},
212
- )
213
- data = component_to_dict(converter, "docling_converter")
214
- restored = component_from_dict(DoclingConverter, data, "docling_converter")
211
+ data = {
212
+ "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
213
+ "init_parameters": {
214
+ "converter": None,
215
+ "convert_kwargs": {"raises_on_error": False},
216
+ "export_type": "json",
217
+ "md_export_kwargs": {"image_placeholder": "[img]"},
218
+ "chunker": None,
219
+ "meta_extractor": {
220
+ "type": "haystack_integrations.components.converters.docling.converter.MetaExtractor",
221
+ "data": {},
222
+ },
223
+ },
224
+ }
225
+ restored = DoclingConverter.from_dict(data)
215
226
 
227
+ assert restored.converter is None
216
228
  assert restored.convert_kwargs == {"raises_on_error": False}
217
229
  assert restored.export_type == ExportType.JSON
218
230
  assert restored.md_export_kwargs == {"image_placeholder": "[img]"}
231
+ assert restored.chunker is None
232
+ assert isinstance(restored.meta_extractor, MetaExtractor)
233
+
234
+
235
def test_component_to_dict_chunker_warns_and_is_dropped() -> None:
    """A custom chunker must not appear in the serialized form; `chunker` is written as `None`."""
    # NOTE(review): only the dropped value is checked here; the warning named in the test title is not asserted.
    expected = {
        "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
        "init_parameters": {
            "converter": None,
            "convert_kwargs": {},
            "export_type": "doc_chunks",
            "md_export_kwargs": {"image_placeholder": ""},
            "chunker": None,
            "meta_extractor": None,
        },
    }
    component_under_test = DoclingConverter(chunker=HybridChunker(merge_peers=False))

    serialized = component_under_test.to_dict()

    assert serialized == expected
219
249
 
220
250
 
221
251
  def test_run_with_sources_parameter() -> None:
@@ -356,13 +386,129 @@ def test_run_with_bytestream_source() -> None:
356
386
 
357
387
  bytestream = ByteStream(data=b"%PDF-1.4 fake pdf content", meta={"file_path": "uploaded.pdf"})
358
388
 
359
- with patch("os.unlink"):
360
- result = converter.run(sources=[bytestream])
389
+ result = converter.run(sources=[bytestream])
361
390
 
362
391
  documents = result["documents"]
363
392
  assert len(documents) == 1
364
393
  # ByteStream meta is merged into the output document
365
394
  assert documents[0].meta["file_path"] == "uploaded.pdf"
366
- # docling was called with a temp file path, not the ByteStream directly
395
+ # docling was called with a DocumentStream, not a temp file path
367
396
  call_args = converter_mock.convert.call_args
368
- assert call_args.kwargs["source"] != bytestream
397
+ passed_source = call_args.kwargs["source"]
398
+ assert isinstance(passed_source, DocumentStream)
399
+ assert passed_source.name == "uploaded.pdf"
400
+ assert isinstance(passed_source.stream, BytesIO)
401
+
402
+
403
class TestBytestreamToDocumentStream:
    """Tests for how `_bytestream_to_document_stream` names and wraps a `ByteStream`."""

    def test_uses_file_path(self) -> None:
        """A bare `file_path` is used as the name and the payload is readable from the stream."""
        stream_in = ByteStream(data=b"data", meta={"file_path": "report.pdf"})
        converted = _bytestream_to_document_stream(stream_in)
        assert converted.name == "report.pdf"
        assert converted.stream.read() == b"data"

    def test_strips_directory_from_file_path(self) -> None:
        """Only the final component of `file_path` is kept."""
        stream_in = ByteStream(data=b"data", meta={"file_path": "/some/deep/path/report.pdf"})
        converted = _bytestream_to_document_stream(stream_in)
        assert converted.name == "report.pdf"

    def test_uses_file_name_key(self) -> None:
        """`file_name` is honoured when `file_path` is absent."""
        stream_in = ByteStream(data=b"data", meta={"file_name": "slide-deck.pptx"})
        converted = _bytestream_to_document_stream(stream_in)
        assert converted.name == "slide-deck.pptx"

    def test_uses_name_key(self) -> None:
        """`name` is honoured when neither `file_path` nor `file_name` is present."""
        stream_in = ByteStream(data=b"data", meta={"name": "notes.docx"})
        converted = _bytestream_to_document_stream(stream_in)
        assert converted.name == "notes.docx"

    def test_file_path_takes_priority_over_file_name(self) -> None:
        """`file_path` wins over `file_name` when both are given."""
        stream_in = ByteStream(data=b"data", meta={"file_path": "real.pdf", "file_name": "other.pdf"})
        converted = _bytestream_to_document_stream(stream_in)
        assert converted.name == "real.pdf"

    def test_file_name_takes_priority_over_name(self) -> None:
        """`file_name` wins over `name` when both are given."""
        stream_in = ByteStream(data=b"data", meta={"file_name": "chosen.pdf", "name": "ignored.pdf"})
        converted = _bytestream_to_document_stream(stream_in)
        assert converted.name == "chosen.pdf"

    def test_guesses_extension_from_mime_type(self) -> None:
        """A suffix-less name receives the extension guessed from the MIME type."""
        mime = "application/pdf"
        guessed_suffix = mimetypes.guess_extension(mime)
        stream_in = ByteStream(data=b"data", meta={"file_path": "report"}, mime_type=mime)
        converted = _bytestream_to_document_stream(stream_in)
        assert converted.name == f"report{guessed_suffix}"

    def test_keeps_extension_when_present(self) -> None:
        """An existing suffix is never replaced by one derived from the MIME type."""
        stream_in = ByteStream(data=b"data", meta={"file_path": "report.pdf"}, mime_type="text/plain")
        converted = _bytestream_to_document_stream(stream_in)
        assert converted.name == "report.pdf"

    def test_no_meta_no_mime_type(self) -> None:
        """Without meta and MIME type the generic name is used as-is."""
        stream_in = ByteStream(data=b"data")
        converted = _bytestream_to_document_stream(stream_in)
        assert converted.name == "document"

    def test_no_meta_with_mime_type(self) -> None:
        """Without meta, the generic name receives the MIME-derived extension."""
        mime = "application/pdf"
        guessed_suffix = mimetypes.guess_extension(mime)
        stream_in = ByteStream(data=b"data", mime_type=mime)
        converted = _bytestream_to_document_stream(stream_in)
        assert converted.name == f"document{guessed_suffix}"

    def test_empty_meta_no_mime_type(self) -> None:
        """An empty meta dict behaves like missing meta."""
        stream_in = ByteStream(data=b"data", meta={})
        converted = _bytestream_to_document_stream(stream_in)
        assert converted.name == "document"

    def test_returns_document_stream_with_bytesio(self) -> None:
        """The result is a `DocumentStream` whose stream is a `BytesIO`."""
        stream_in = ByteStream(data=b"hello", meta={"file_path": "f.pdf"})
        converted = _bytestream_to_document_stream(stream_in)
        assert isinstance(converted, DocumentStream)
        assert isinstance(converted.stream, BytesIO)

    def test_unknown_mime_type_keeps_base_name(self) -> None:
        """When no extension can be guessed for the MIME type, the name is left untouched."""
        unknown_mime = "application/x-totally-made-up-type"
        # Precondition: the standard library really has no extension for this type.
        assert mimetypes.guess_extension("application/x-totally-made-up-type") is None
        stream_in = ByteStream(
            data=b"data",
            meta={"file_path": "report"},
            mime_type=unknown_mime,
        )
        converted = _bytestream_to_document_stream(stream_in)
        assert converted.name == "report"
481
+
482
+
483
class TestMetaExtractor:
    """Tests for the metadata dictionaries produced by `MetaExtractor`."""

    def test_extract_chunk_meta_wraps_export_json_dict(self) -> None:
        """The chunk's `export_json_dict()` result is returned under the `dl_meta` key."""
        fake_chunk = MagicMock(**{"export_json_dict.return_value": {"some": "dict"}})

        extracted = MetaExtractor().extract_chunk_meta(chunk=fake_chunk)

        assert extracted == {"dl_meta": {"some": "dict"}}
        fake_chunk.export_json_dict.assert_called_once_with()

    def test_extract_dl_doc_meta_with_origin(self) -> None:
        """With an origin, its `model_dump(exclude_none=True)` output is nested under `dl_meta.origin`."""
        origin_dump = {"filename": "foo.pdf", "mimetype": "application/pdf"}
        fake_doc = MagicMock(**{"origin.model_dump.return_value": origin_dump})

        extracted = MetaExtractor().extract_dl_doc_meta(dl_doc=fake_doc)

        assert extracted == {"dl_meta": {"origin": {"filename": "foo.pdf", "mimetype": "application/pdf"}}}
        fake_doc.origin.model_dump.assert_called_once_with(exclude_none=True)

    def test_extract_dl_doc_meta_without_origin(self) -> None:
        """Without an origin, the extracted meta is empty."""
        fake_doc = MagicMock(origin=None)

        extracted = MetaExtractor().extract_dl_doc_meta(dl_doc=fake_doc)

        assert extracted == {}
509
+
510
+
511
def test_run_without_sources_or_paths_raises_value_error() -> None:
    """Calling `run()` with neither `sources` nor `paths` raises `ValueError` with the expected message."""
    expected_message = r"Either 'sources' or the deprecated 'paths' parameter must be provided."
    component_under_test = DoclingConverter(converter=MagicMock(), meta_extractor=MagicMock())

    with pytest.raises(ValueError, match=expected_message):
        component_under_test.run()