docling-haystack 0.3.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling_haystack-0.3.0 → docling_haystack-0.4.0}/CHANGELOG.md +7 -0
- {docling_haystack-0.3.0 → docling_haystack-0.4.0}/PKG-INFO +1 -1
- {docling_haystack-0.3.0 → docling_haystack-0.4.0}/src/haystack_integrations/components/converters/docling/converter.py +61 -1
- {docling_haystack-0.3.0 → docling_haystack-0.4.0}/tests/test_converter.py +120 -49
- {docling_haystack-0.3.0 → docling_haystack-0.4.0}/.gitignore +0 -0
- {docling_haystack-0.3.0 → docling_haystack-0.4.0}/LICENSE.txt +0 -0
- {docling_haystack-0.3.0 → docling_haystack-0.4.0}/README.md +0 -0
- {docling_haystack-0.3.0 → docling_haystack-0.4.0}/pydoc/config_docusaurus.yml +0 -0
- {docling_haystack-0.3.0 → docling_haystack-0.4.0}/pyproject.toml +0 -0
- {docling_haystack-0.3.0 → docling_haystack-0.4.0}/src/docling_haystack/__init__.py +0 -0
- {docling_haystack-0.3.0 → docling_haystack-0.4.0}/src/docling_haystack/converter.py +0 -0
- {docling_haystack-0.3.0 → docling_haystack-0.4.0}/src/haystack_integrations/components/converters/docling/__init__.py +0 -0
- {docling_haystack-0.3.0 → docling_haystack-0.4.0}/src/haystack_integrations/components/converters/py.typed +0 -0
- {docling_haystack-0.3.0 → docling_haystack-0.4.0}/tests/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-haystack
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Haystack integration for docling
|
|
5
5
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/docling#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
|
|
@@ -10,14 +10,18 @@ from pathlib import Path
|
|
|
10
10
|
from typing import Any
|
|
11
11
|
|
|
12
12
|
from docling_core.types.io import DocumentStream
|
|
13
|
-
from haystack import Document, component
|
|
13
|
+
from haystack import Document, component, logging
|
|
14
14
|
from haystack.components.converters.utils import normalize_metadata
|
|
15
|
+
from haystack.core.serialization import default_from_dict, default_to_dict
|
|
15
16
|
from haystack.dataclasses import ByteStream
|
|
17
|
+
from haystack.utils.base_serialization import deserialize_class_instance, serialize_class_instance
|
|
16
18
|
|
|
17
19
|
from docling.chunking import BaseChunk, BaseChunker, HybridChunker
|
|
18
20
|
from docling.datamodel.document import DoclingDocument
|
|
19
21
|
from docling.document_converter import DocumentConverter
|
|
20
22
|
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
21
25
|
|
|
22
26
|
def _bytestream_to_document_stream(source: ByteStream) -> DocumentStream:
|
|
23
27
|
"""
|
|
@@ -63,6 +67,15 @@ class BaseMetaExtractor(ABC):
|
|
|
63
67
|
"""Extract Docling document meta."""
|
|
64
68
|
raise NotImplementedError()
|
|
65
69
|
|
|
70
|
+
def to_dict(self) -> dict[str, Any]:
|
|
71
|
+
"""Serialize to a dictionary."""
|
|
72
|
+
return {}
|
|
73
|
+
|
|
74
|
+
@classmethod
|
|
75
|
+
def from_dict(cls, data: dict[str, Any]) -> "BaseMetaExtractor": # noqa: ARG003
|
|
76
|
+
"""Deserialize from a dictionary."""
|
|
77
|
+
return cls()
|
|
78
|
+
|
|
66
79
|
|
|
67
80
|
class MetaExtractor(BaseMetaExtractor):
|
|
68
81
|
"""MetaExtractor."""
|
|
@@ -123,6 +136,53 @@ class DoclingConverter:
|
|
|
123
136
|
self._chunker_instance = chunker or HybridChunker()
|
|
124
137
|
self._meta_extractor_instance = meta_extractor or MetaExtractor()
|
|
125
138
|
|
|
139
|
+
def to_dict(self) -> dict[str, Any]:
|
|
140
|
+
"""Serialize this component to a dictionary."""
|
|
141
|
+
if self.converter is not None:
|
|
142
|
+
logger.warning(
|
|
143
|
+
"DoclingConverter.to_dict: the 'converter' parameter cannot be serialized and will be dropped. "
|
|
144
|
+
"The component will use the default DocumentConverter when restored from the serialized form."
|
|
145
|
+
)
|
|
146
|
+
if self.chunker is not None:
|
|
147
|
+
logger.warning(
|
|
148
|
+
"DoclingConverter.to_dict: the 'chunker' parameter cannot be serialized and will be dropped. "
|
|
149
|
+
"The component will use the default chunker when restored from the serialized form."
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
meta_extractor_data = None
|
|
153
|
+
if self.meta_extractor is not None:
|
|
154
|
+
meta_extractor_data = serialize_class_instance(self.meta_extractor)
|
|
155
|
+
|
|
156
|
+
return default_to_dict(
|
|
157
|
+
self,
|
|
158
|
+
converter=None,
|
|
159
|
+
convert_kwargs=self.convert_kwargs,
|
|
160
|
+
export_type=self.export_type.value,
|
|
161
|
+
md_export_kwargs=self.md_export_kwargs,
|
|
162
|
+
chunker=None,
|
|
163
|
+
meta_extractor=meta_extractor_data,
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
@classmethod
|
|
167
|
+
def from_dict(cls, data: dict[str, Any]) -> "DoclingConverter":
|
|
168
|
+
"""
|
|
169
|
+
Deserialize this component from a dictionary.
|
|
170
|
+
|
|
171
|
+
The `converter` and `chunker` parameters are not serializable and are always ignored during
|
|
172
|
+
deserialization; the restored instance will use the default `DocumentConverter` and `HybridChunker`
|
|
173
|
+
respectively.
|
|
174
|
+
|
|
175
|
+
:param data: Dictionary with keys `type` and `init_parameters`, as produced by `to_dict`.
|
|
176
|
+
:returns: A new `DoclingConverter` instance.
|
|
177
|
+
"""
|
|
178
|
+
init_params = data.get("init_parameters", {})
|
|
179
|
+
|
|
180
|
+
meta_extractor_data = init_params.get("meta_extractor")
|
|
181
|
+
if meta_extractor_data is not None:
|
|
182
|
+
init_params["meta_extractor"] = deserialize_class_instance(meta_extractor_data)
|
|
183
|
+
|
|
184
|
+
return default_from_dict(cls, data)
|
|
185
|
+
|
|
126
186
|
@component.output_types(documents=list[Document])
|
|
127
187
|
def run(
|
|
128
188
|
self,
|
|
@@ -7,11 +7,16 @@ from typing import Any
|
|
|
7
7
|
from unittest.mock import MagicMock
|
|
8
8
|
|
|
9
9
|
import pytest
|
|
10
|
+
from docling.chunking import HybridChunker
|
|
11
|
+
from docling.document_converter import DocumentConverter
|
|
10
12
|
from docling_core.types.io import DocumentStream
|
|
11
|
-
from haystack.core.serialization import component_from_dict, component_to_dict
|
|
12
13
|
from haystack.dataclasses import ByteStream
|
|
13
14
|
|
|
14
|
-
from haystack_integrations.components.converters.docling import DoclingConverter, ExportType
|
|
15
|
+
from haystack_integrations.components.converters.docling import (
|
|
16
|
+
DoclingConverter,
|
|
17
|
+
ExportType,
|
|
18
|
+
MetaExtractor,
|
|
19
|
+
)
|
|
15
20
|
from haystack_integrations.components.converters.docling.converter import _bytestream_to_document_stream
|
|
16
21
|
|
|
17
22
|
|
|
@@ -130,8 +135,6 @@ def test_run_json_minimal() -> None:
|
|
|
130
135
|
|
|
131
136
|
|
|
132
137
|
def test_legacy_import_path() -> None:
|
|
133
|
-
import warnings
|
|
134
|
-
|
|
135
138
|
with warnings.catch_warnings(record=True) as caught:
|
|
136
139
|
warnings.simplefilter("always")
|
|
137
140
|
from docling_haystack.converter import DoclingConverter as LegacyDoclingConverter
|
|
@@ -142,63 +145,59 @@ def test_legacy_import_path() -> None:
|
|
|
142
145
|
)
|
|
143
146
|
|
|
144
147
|
|
|
145
|
-
def
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
# convert_kwargs and md_export_kwargs were always serialized as null.
|
|
149
|
-
# Verify that such a serialized dict still deserializes correctly.
|
|
150
|
-
legacy_data = {
|
|
148
|
+
def test_component_to_dict_defaults() -> None:
|
|
149
|
+
converter = DoclingConverter()
|
|
150
|
+
assert converter.to_dict() == {
|
|
151
151
|
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
|
|
152
152
|
"init_parameters": {
|
|
153
153
|
"converter": None,
|
|
154
|
-
"convert_kwargs": None,
|
|
154
|
+
"convert_kwargs": {},
|
|
155
155
|
"export_type": "doc_chunks",
|
|
156
|
-
"md_export_kwargs": None,
|
|
156
|
+
"md_export_kwargs": {"image_placeholder": ""},
|
|
157
157
|
"chunker": None,
|
|
158
158
|
"meta_extractor": None,
|
|
159
159
|
},
|
|
160
160
|
}
|
|
161
|
-
restored = component_from_dict(DoclingConverter, legacy_data, "docling_converter")
|
|
162
|
-
|
|
163
|
-
assert restored.convert_kwargs == {}
|
|
164
|
-
assert restored.md_export_kwargs == {"image_placeholder": ""}
|
|
165
|
-
assert restored.export_type == ExportType.DOC_CHUNKS
|
|
166
|
-
assert restored.converter is None
|
|
167
|
-
assert restored.chunker is None
|
|
168
|
-
assert restored.meta_extractor is None
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
def test_component_to_dict_defaults() -> None:
|
|
172
|
-
converter = DoclingConverter()
|
|
173
|
-
data = component_to_dict(converter, "docling_converter")
|
|
174
|
-
|
|
175
|
-
init_params = data["init_parameters"]
|
|
176
|
-
assert init_params["converter"] is None
|
|
177
|
-
assert init_params["convert_kwargs"] == {}
|
|
178
|
-
assert init_params["export_type"] == ExportType.DOC_CHUNKS
|
|
179
|
-
assert init_params["md_export_kwargs"] == {"image_placeholder": ""}
|
|
180
|
-
assert init_params["chunker"] is None
|
|
181
|
-
assert init_params["meta_extractor"] is None
|
|
182
161
|
|
|
183
162
|
|
|
184
163
|
def test_component_to_dict_custom_params() -> None:
|
|
185
164
|
converter = DoclingConverter(
|
|
165
|
+
converter=DocumentConverter(),
|
|
186
166
|
convert_kwargs={"raises_on_error": False},
|
|
187
167
|
export_type=ExportType.MARKDOWN,
|
|
188
168
|
md_export_kwargs={"image_placeholder": "[img]"},
|
|
169
|
+
meta_extractor=MetaExtractor(),
|
|
189
170
|
)
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
171
|
+
assert converter.to_dict() == {
|
|
172
|
+
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
|
|
173
|
+
"init_parameters": {
|
|
174
|
+
"converter": None,
|
|
175
|
+
"convert_kwargs": {"raises_on_error": False},
|
|
176
|
+
"export_type": "markdown",
|
|
177
|
+
"md_export_kwargs": {"image_placeholder": "[img]"},
|
|
178
|
+
"chunker": None,
|
|
179
|
+
"meta_extractor": {
|
|
180
|
+
"type": "haystack_integrations.components.converters.docling.converter.MetaExtractor",
|
|
181
|
+
"data": {},
|
|
182
|
+
},
|
|
183
|
+
},
|
|
184
|
+
}
|
|
196
185
|
|
|
197
186
|
|
|
198
187
|
def test_component_from_dict_defaults() -> None:
|
|
199
|
-
|
|
200
|
-
data =
|
|
201
|
-
|
|
188
|
+
# null kwargs mirror the pre-refactor serialization format and must still deserialize correctly
|
|
189
|
+
data = {
|
|
190
|
+
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
|
|
191
|
+
"init_parameters": {
|
|
192
|
+
"converter": None,
|
|
193
|
+
"convert_kwargs": None,
|
|
194
|
+
"export_type": "doc_chunks",
|
|
195
|
+
"md_export_kwargs": None,
|
|
196
|
+
"chunker": None,
|
|
197
|
+
"meta_extractor": None,
|
|
198
|
+
},
|
|
199
|
+
}
|
|
200
|
+
restored = DoclingConverter.from_dict(data)
|
|
202
201
|
|
|
203
202
|
assert restored.converter is None
|
|
204
203
|
assert restored.convert_kwargs == {}
|
|
@@ -209,17 +208,44 @@ def test_component_from_dict_defaults() -> None:
|
|
|
209
208
|
|
|
210
209
|
|
|
211
210
|
def test_component_from_dict_custom_params() -> None:
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
211
|
+
data = {
|
|
212
|
+
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
|
|
213
|
+
"init_parameters": {
|
|
214
|
+
"converter": None,
|
|
215
|
+
"convert_kwargs": {"raises_on_error": False},
|
|
216
|
+
"export_type": "json",
|
|
217
|
+
"md_export_kwargs": {"image_placeholder": "[img]"},
|
|
218
|
+
"chunker": None,
|
|
219
|
+
"meta_extractor": {
|
|
220
|
+
"type": "haystack_integrations.components.converters.docling.converter.MetaExtractor",
|
|
221
|
+
"data": {},
|
|
222
|
+
},
|
|
223
|
+
},
|
|
224
|
+
}
|
|
225
|
+
restored = DoclingConverter.from_dict(data)
|
|
219
226
|
|
|
227
|
+
assert restored.converter is None
|
|
220
228
|
assert restored.convert_kwargs == {"raises_on_error": False}
|
|
221
229
|
assert restored.export_type == ExportType.JSON
|
|
222
230
|
assert restored.md_export_kwargs == {"image_placeholder": "[img]"}
|
|
231
|
+
assert restored.chunker is None
|
|
232
|
+
assert isinstance(restored.meta_extractor, MetaExtractor)
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def test_component_to_dict_chunker_warns_and_is_dropped() -> None:
|
|
236
|
+
converter = DoclingConverter(chunker=HybridChunker(merge_peers=False))
|
|
237
|
+
|
|
238
|
+
assert converter.to_dict() == {
|
|
239
|
+
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
|
|
240
|
+
"init_parameters": {
|
|
241
|
+
"converter": None,
|
|
242
|
+
"convert_kwargs": {},
|
|
243
|
+
"export_type": "doc_chunks",
|
|
244
|
+
"md_export_kwargs": {"image_placeholder": ""},
|
|
245
|
+
"chunker": None,
|
|
246
|
+
"meta_extractor": None,
|
|
247
|
+
},
|
|
248
|
+
}
|
|
223
249
|
|
|
224
250
|
|
|
225
251
|
def test_run_with_sources_parameter() -> None:
|
|
@@ -441,3 +467,48 @@ class TestBytestreamToDocumentStream:
|
|
|
441
467
|
ds = _bytestream_to_document_stream(bs)
|
|
442
468
|
assert isinstance(ds, DocumentStream)
|
|
443
469
|
assert isinstance(ds.stream, BytesIO)
|
|
470
|
+
|
|
471
|
+
def test_unknown_mime_type_keeps_base_name(self) -> None:
|
|
472
|
+
# mimetypes.guess_extension returns None for unknown types, so the name stays as-is.
|
|
473
|
+
assert mimetypes.guess_extension("application/x-totally-made-up-type") is None
|
|
474
|
+
bs = ByteStream(
|
|
475
|
+
data=b"data",
|
|
476
|
+
meta={"file_path": "report"},
|
|
477
|
+
mime_type="application/x-totally-made-up-type",
|
|
478
|
+
)
|
|
479
|
+
ds = _bytestream_to_document_stream(bs)
|
|
480
|
+
assert ds.name == "report"
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
class TestMetaExtractor:
|
|
484
|
+
def test_extract_chunk_meta_wraps_export_json_dict(self) -> None:
|
|
485
|
+
chunk = MagicMock()
|
|
486
|
+
chunk.export_json_dict.return_value = {"some": "dict"}
|
|
487
|
+
|
|
488
|
+
result = MetaExtractor().extract_chunk_meta(chunk=chunk)
|
|
489
|
+
|
|
490
|
+
assert result == {"dl_meta": {"some": "dict"}}
|
|
491
|
+
chunk.export_json_dict.assert_called_once_with()
|
|
492
|
+
|
|
493
|
+
def test_extract_dl_doc_meta_with_origin(self) -> None:
|
|
494
|
+
dl_doc = MagicMock()
|
|
495
|
+
dl_doc.origin.model_dump.return_value = {"filename": "foo.pdf", "mimetype": "application/pdf"}
|
|
496
|
+
|
|
497
|
+
result = MetaExtractor().extract_dl_doc_meta(dl_doc=dl_doc)
|
|
498
|
+
|
|
499
|
+
assert result == {"dl_meta": {"origin": {"filename": "foo.pdf", "mimetype": "application/pdf"}}}
|
|
500
|
+
dl_doc.origin.model_dump.assert_called_once_with(exclude_none=True)
|
|
501
|
+
|
|
502
|
+
def test_extract_dl_doc_meta_without_origin(self) -> None:
|
|
503
|
+
dl_doc = MagicMock()
|
|
504
|
+
dl_doc.origin = None
|
|
505
|
+
|
|
506
|
+
result = MetaExtractor().extract_dl_doc_meta(dl_doc=dl_doc)
|
|
507
|
+
|
|
508
|
+
assert result == {}
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
def test_run_without_sources_or_paths_raises_value_error() -> None:
|
|
512
|
+
converter = DoclingConverter(converter=MagicMock(), meta_extractor=MagicMock())
|
|
513
|
+
with pytest.raises(ValueError, match=r"Either 'sources' or the deprecated 'paths' parameter must be provided."):
|
|
514
|
+
converter.run()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|