docling-haystack 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling_haystack-0.3.0/CHANGELOG.md +13 -0
- {docling_haystack-0.2.0 → docling_haystack-0.3.0}/PKG-INFO +1 -1
- {docling_haystack-0.2.0 → docling_haystack-0.3.0}/src/haystack_integrations/components/converters/docling/converter.py +28 -10
- {docling_haystack-0.2.0 → docling_haystack-0.3.0}/tests/test_converter.py +80 -5
- {docling_haystack-0.2.0 → docling_haystack-0.3.0}/.gitignore +0 -0
- {docling_haystack-0.2.0 → docling_haystack-0.3.0}/LICENSE.txt +0 -0
- {docling_haystack-0.2.0 → docling_haystack-0.3.0}/README.md +0 -0
- {docling_haystack-0.2.0 → docling_haystack-0.3.0}/pydoc/config_docusaurus.yml +0 -0
- {docling_haystack-0.2.0 → docling_haystack-0.3.0}/pyproject.toml +0 -0
- {docling_haystack-0.2.0 → docling_haystack-0.3.0}/src/docling_haystack/__init__.py +0 -0
- {docling_haystack-0.2.0 → docling_haystack-0.3.0}/src/docling_haystack/converter.py +0 -0
- {docling_haystack-0.2.0 → docling_haystack-0.3.0}/src/haystack_integrations/components/converters/docling/__init__.py +0 -0
- {docling_haystack-0.2.0 → docling_haystack-0.3.0}/src/haystack_integrations/components/converters/py.typed +0 -0
- {docling_haystack-0.2.0 → docling_haystack-0.3.0}/tests/__init__.py +0 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [integrations/docling-v0.2.0] - 2026-04-08
|
|
4
|
+
|
|
5
|
+
### 🚀 Features
|
|
6
|
+
|
|
7
|
+
- Add Docling document converter (#3066)
|
|
8
|
+
|
|
9
|
+
### 🚜 Refactor
|
|
10
|
+
|
|
11
|
+
- *(docling)* Add meta parameter to run(); introduce sources; deprecate paths (#3103)
|
|
12
|
+
|
|
13
|
+
<!-- generated by git-cliff -->
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-haystack
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Haystack integration for docling
|
|
5
5
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/docling#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
|
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
"""Docling Haystack converter module."""
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
|
-
import os
|
|
5
|
-
import tempfile
|
|
4
|
+
import mimetypes
|
|
6
5
|
import warnings
|
|
7
6
|
from abc import ABC, abstractmethod
|
|
8
7
|
from enum import Enum
|
|
8
|
+
from io import BytesIO
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
from typing import Any
|
|
11
11
|
|
|
12
|
+
from docling_core.types.io import DocumentStream
|
|
12
13
|
from haystack import Document, component
|
|
13
14
|
from haystack.components.converters.utils import normalize_metadata
|
|
14
15
|
from haystack.dataclasses import ByteStream
|
|
@@ -18,6 +19,29 @@ from docling.datamodel.document import DoclingDocument
|
|
|
18
19
|
from docling.document_converter import DocumentConverter
|
|
19
20
|
|
|
20
21
|
|
|
22
|
+
def _bytestream_to_document_stream(source: ByteStream) -> DocumentStream:
|
|
23
|
+
"""
|
|
24
|
+
Build a `DocumentStream` from a Haystack `ByteStream`.
|
|
25
|
+
|
|
26
|
+
Resolves the stream name by checking common metadata keys (`file_path`, `file_name`, `name`) and falling back to
|
|
27
|
+
MIME-type extension guessing so that docling can reliably detect the input format.
|
|
28
|
+
"""
|
|
29
|
+
meta = source.meta or {}
|
|
30
|
+
raw_name = meta.get("file_path") or meta.get("file_name") or meta.get("name")
|
|
31
|
+
|
|
32
|
+
if raw_name:
|
|
33
|
+
name = Path(raw_name).name
|
|
34
|
+
else:
|
|
35
|
+
name = "document"
|
|
36
|
+
|
|
37
|
+
if not Path(name).suffix and source.mime_type:
|
|
38
|
+
ext = mimetypes.guess_extension(source.mime_type)
|
|
39
|
+
if ext:
|
|
40
|
+
name = f"{name}{ext}"
|
|
41
|
+
|
|
42
|
+
return DocumentStream(name=name, stream=BytesIO(source.data))
|
|
43
|
+
|
|
44
|
+
|
|
21
45
|
class ExportType(str, Enum):
|
|
22
46
|
"""Enumeration of available export types."""
|
|
23
47
|
|
|
@@ -141,14 +165,8 @@ class DoclingConverter:
|
|
|
141
165
|
documents: list[Document] = []
|
|
142
166
|
for source, source_meta in zip(sources, meta_list, strict=True):
|
|
143
167
|
if isinstance(source, ByteStream):
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
tmp.write(source.data)
|
|
147
|
-
tmp_path = Path(tmp.name)
|
|
148
|
-
try:
|
|
149
|
-
dl_doc = self._converter_instance.convert(source=tmp_path, **self.convert_kwargs).document
|
|
150
|
-
finally:
|
|
151
|
-
os.unlink(tmp_path)
|
|
168
|
+
doc_stream = _bytestream_to_document_stream(source)
|
|
169
|
+
dl_doc = self._converter_instance.convert(source=doc_stream, **self.convert_kwargs).document
|
|
152
170
|
# merge ByteStream meta (e.g. file_path, mime_type) with user-supplied meta
|
|
153
171
|
merged_meta = {**(source.meta or {}), **source_meta}
|
|
154
172
|
else:
|
|
@@ -1,14 +1,18 @@
|
|
|
1
1
|
import json
|
|
2
|
+
import mimetypes
|
|
2
3
|
import warnings
|
|
4
|
+
from io import BytesIO
|
|
3
5
|
from types import SimpleNamespace
|
|
4
6
|
from typing import Any
|
|
5
|
-
from unittest.mock import MagicMock
|
|
7
|
+
from unittest.mock import MagicMock
|
|
6
8
|
|
|
7
9
|
import pytest
|
|
10
|
+
from docling_core.types.io import DocumentStream
|
|
8
11
|
from haystack.core.serialization import component_from_dict, component_to_dict
|
|
9
12
|
from haystack.dataclasses import ByteStream
|
|
10
13
|
|
|
11
14
|
from haystack_integrations.components.converters.docling import DoclingConverter, ExportType
|
|
15
|
+
from haystack_integrations.components.converters.docling.converter import _bytestream_to_document_stream
|
|
12
16
|
|
|
13
17
|
|
|
14
18
|
def test_run_doc_chunks_minimal() -> None:
|
|
@@ -356,13 +360,84 @@ def test_run_with_bytestream_source() -> None:
|
|
|
356
360
|
|
|
357
361
|
bytestream = ByteStream(data=b"%PDF-1.4 fake pdf content", meta={"file_path": "uploaded.pdf"})
|
|
358
362
|
|
|
359
|
-
|
|
360
|
-
result = converter.run(sources=[bytestream])
|
|
363
|
+
result = converter.run(sources=[bytestream])
|
|
361
364
|
|
|
362
365
|
documents = result["documents"]
|
|
363
366
|
assert len(documents) == 1
|
|
364
367
|
# ByteStream meta is merged into the output document
|
|
365
368
|
assert documents[0].meta["file_path"] == "uploaded.pdf"
|
|
366
|
-
# docling was called with a
|
|
369
|
+
# docling was called with a DocumentStream, not a temp file path
|
|
367
370
|
call_args = converter_mock.convert.call_args
|
|
368
|
-
|
|
371
|
+
passed_source = call_args.kwargs["source"]
|
|
372
|
+
assert isinstance(passed_source, DocumentStream)
|
|
373
|
+
assert passed_source.name == "uploaded.pdf"
|
|
374
|
+
assert isinstance(passed_source.stream, BytesIO)
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
class TestBytestreamToDocumentStream:
|
|
378
|
+
def test_uses_file_path(self) -> None:
|
|
379
|
+
bs = ByteStream(data=b"data", meta={"file_path": "report.pdf"})
|
|
380
|
+
ds = _bytestream_to_document_stream(bs)
|
|
381
|
+
assert ds.name == "report.pdf"
|
|
382
|
+
assert ds.stream.read() == b"data"
|
|
383
|
+
|
|
384
|
+
def test_strips_directory_from_file_path(self) -> None:
|
|
385
|
+
bs = ByteStream(data=b"data", meta={"file_path": "/some/deep/path/report.pdf"})
|
|
386
|
+
ds = _bytestream_to_document_stream(bs)
|
|
387
|
+
assert ds.name == "report.pdf"
|
|
388
|
+
|
|
389
|
+
def test_uses_file_name_key(self) -> None:
|
|
390
|
+
bs = ByteStream(data=b"data", meta={"file_name": "slide-deck.pptx"})
|
|
391
|
+
ds = _bytestream_to_document_stream(bs)
|
|
392
|
+
assert ds.name == "slide-deck.pptx"
|
|
393
|
+
|
|
394
|
+
def test_uses_name_key(self) -> None:
|
|
395
|
+
bs = ByteStream(data=b"data", meta={"name": "notes.docx"})
|
|
396
|
+
ds = _bytestream_to_document_stream(bs)
|
|
397
|
+
assert ds.name == "notes.docx"
|
|
398
|
+
|
|
399
|
+
def test_file_path_takes_priority_over_file_name(self) -> None:
|
|
400
|
+
bs = ByteStream(data=b"data", meta={"file_path": "real.pdf", "file_name": "other.pdf"})
|
|
401
|
+
ds = _bytestream_to_document_stream(bs)
|
|
402
|
+
assert ds.name == "real.pdf"
|
|
403
|
+
|
|
404
|
+
def test_file_name_takes_priority_over_name(self) -> None:
|
|
405
|
+
bs = ByteStream(data=b"data", meta={"file_name": "chosen.pdf", "name": "ignored.pdf"})
|
|
406
|
+
ds = _bytestream_to_document_stream(bs)
|
|
407
|
+
assert ds.name == "chosen.pdf"
|
|
408
|
+
|
|
409
|
+
def test_guesses_extension_from_mime_type(self) -> None:
|
|
410
|
+
mime = "application/pdf"
|
|
411
|
+
expected_ext = mimetypes.guess_extension(mime)
|
|
412
|
+
bs = ByteStream(data=b"data", meta={"file_path": "report"}, mime_type=mime)
|
|
413
|
+
ds = _bytestream_to_document_stream(bs)
|
|
414
|
+
assert ds.name == f"report{expected_ext}"
|
|
415
|
+
|
|
416
|
+
def test_keeps_extension_when_present(self) -> None:
|
|
417
|
+
# mime_type should not override an already-present extension
|
|
418
|
+
bs = ByteStream(data=b"data", meta={"file_path": "report.pdf"}, mime_type="text/plain")
|
|
419
|
+
ds = _bytestream_to_document_stream(bs)
|
|
420
|
+
assert ds.name == "report.pdf"
|
|
421
|
+
|
|
422
|
+
def test_no_meta_no_mime_type(self) -> None:
|
|
423
|
+
bs = ByteStream(data=b"data")
|
|
424
|
+
ds = _bytestream_to_document_stream(bs)
|
|
425
|
+
assert ds.name == "document"
|
|
426
|
+
|
|
427
|
+
def test_no_meta_with_mime_type(self) -> None:
|
|
428
|
+
mime = "application/pdf"
|
|
429
|
+
expected_ext = mimetypes.guess_extension(mime)
|
|
430
|
+
bs = ByteStream(data=b"data", mime_type=mime)
|
|
431
|
+
ds = _bytestream_to_document_stream(bs)
|
|
432
|
+
assert ds.name == f"document{expected_ext}"
|
|
433
|
+
|
|
434
|
+
def test_empty_meta_no_mime_type(self) -> None:
|
|
435
|
+
bs = ByteStream(data=b"data", meta={})
|
|
436
|
+
ds = _bytestream_to_document_stream(bs)
|
|
437
|
+
assert ds.name == "document"
|
|
438
|
+
|
|
439
|
+
def test_returns_document_stream_with_bytesio(self) -> None:
|
|
440
|
+
bs = ByteStream(data=b"hello", meta={"file_path": "f.pdf"})
|
|
441
|
+
ds = _bytestream_to_document_stream(bs)
|
|
442
|
+
assert isinstance(ds, DocumentStream)
|
|
443
|
+
assert isinstance(ds.stream, BytesIO)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|