docling-haystack 0.2.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling_haystack-0.4.0/CHANGELOG.md +20 -0
- {docling_haystack-0.2.0 → docling_haystack-0.4.0}/PKG-INFO +1 -1
- {docling_haystack-0.2.0 → docling_haystack-0.4.0}/src/haystack_integrations/components/converters/docling/converter.py +89 -11
- {docling_haystack-0.2.0 → docling_haystack-0.4.0}/tests/test_converter.py +200 -54
- {docling_haystack-0.2.0 → docling_haystack-0.4.0}/.gitignore +0 -0
- {docling_haystack-0.2.0 → docling_haystack-0.4.0}/LICENSE.txt +0 -0
- {docling_haystack-0.2.0 → docling_haystack-0.4.0}/README.md +0 -0
- {docling_haystack-0.2.0 → docling_haystack-0.4.0}/pydoc/config_docusaurus.yml +0 -0
- {docling_haystack-0.2.0 → docling_haystack-0.4.0}/pyproject.toml +0 -0
- {docling_haystack-0.2.0 → docling_haystack-0.4.0}/src/docling_haystack/__init__.py +0 -0
- {docling_haystack-0.2.0 → docling_haystack-0.4.0}/src/docling_haystack/converter.py +0 -0
- {docling_haystack-0.2.0 → docling_haystack-0.4.0}/src/haystack_integrations/components/converters/docling/__init__.py +0 -0
- {docling_haystack-0.2.0 → docling_haystack-0.4.0}/src/haystack_integrations/components/converters/py.typed +0 -0
- {docling_haystack-0.2.0 → docling_haystack-0.4.0}/tests/__init__.py +0 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [integrations/docling-v0.3.0] - 2026-04-10
|
|
4
|
+
|
|
5
|
+
### 🚀 Features
|
|
6
|
+
|
|
7
|
+
- (docling) Drop temp files for ByteStream sources (#3130)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
## [integrations/docling-v0.2.0] - 2026-04-08
|
|
11
|
+
|
|
12
|
+
### 🚀 Features
|
|
13
|
+
|
|
14
|
+
- Add Docling document converter (#3066)
|
|
15
|
+
|
|
16
|
+
### 🚜 Refactor
|
|
17
|
+
|
|
18
|
+
- *(docling)* Add meta parameter to run(); introduce sources; deprecate paths (#3103)
|
|
19
|
+
|
|
20
|
+
<!-- generated by git-cliff -->
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-haystack
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Haystack integration for docling
|
|
5
5
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/docling#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
|
|
@@ -1,22 +1,50 @@
|
|
|
1
1
|
"""Docling Haystack converter module."""
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
|
-
import os
|
|
5
|
-
import tempfile
|
|
4
|
+
import mimetypes
|
|
6
5
|
import warnings
|
|
7
6
|
from abc import ABC, abstractmethod
|
|
8
7
|
from enum import Enum
|
|
8
|
+
from io import BytesIO
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
from typing import Any
|
|
11
11
|
|
|
12
|
-
from haystack import Document, component
|
|
12
|
+
from docling_core.types.io import DocumentStream
|
|
13
|
+
from haystack import Document, component, logging
|
|
13
14
|
from haystack.components.converters.utils import normalize_metadata
|
|
15
|
+
from haystack.core.serialization import default_from_dict, default_to_dict
|
|
14
16
|
from haystack.dataclasses import ByteStream
|
|
17
|
+
from haystack.utils.base_serialization import deserialize_class_instance, serialize_class_instance
|
|
15
18
|
|
|
16
19
|
from docling.chunking import BaseChunk, BaseChunker, HybridChunker
|
|
17
20
|
from docling.datamodel.document import DoclingDocument
|
|
18
21
|
from docling.document_converter import DocumentConverter
|
|
19
22
|
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _bytestream_to_document_stream(source: ByteStream) -> DocumentStream:
|
|
27
|
+
"""
|
|
28
|
+
Build a `DocumentStream` from a Haystack `ByteStream`.
|
|
29
|
+
|
|
30
|
+
Resolves the stream name by checking common metadata keys (`file_path`, `file_name`, `name`) and falling back to
|
|
31
|
+
MIME-type extension guessing so that docling can reliably detect the input format.
|
|
32
|
+
"""
|
|
33
|
+
meta = source.meta or {}
|
|
34
|
+
raw_name = meta.get("file_path") or meta.get("file_name") or meta.get("name")
|
|
35
|
+
|
|
36
|
+
if raw_name:
|
|
37
|
+
name = Path(raw_name).name
|
|
38
|
+
else:
|
|
39
|
+
name = "document"
|
|
40
|
+
|
|
41
|
+
if not Path(name).suffix and source.mime_type:
|
|
42
|
+
ext = mimetypes.guess_extension(source.mime_type)
|
|
43
|
+
if ext:
|
|
44
|
+
name = f"{name}{ext}"
|
|
45
|
+
|
|
46
|
+
return DocumentStream(name=name, stream=BytesIO(source.data))
|
|
47
|
+
|
|
20
48
|
|
|
21
49
|
class ExportType(str, Enum):
|
|
22
50
|
"""Enumeration of available export types."""
|
|
@@ -39,6 +67,15 @@ class BaseMetaExtractor(ABC):
|
|
|
39
67
|
"""Extract Docling document meta."""
|
|
40
68
|
raise NotImplementedError()
|
|
41
69
|
|
|
70
|
+
def to_dict(self) -> dict[str, Any]:
|
|
71
|
+
"""Serialize to a dictionary."""
|
|
72
|
+
return {}
|
|
73
|
+
|
|
74
|
+
@classmethod
|
|
75
|
+
def from_dict(cls, data: dict[str, Any]) -> "BaseMetaExtractor": # noqa: ARG003
|
|
76
|
+
"""Deserialize from a dictionary."""
|
|
77
|
+
return cls()
|
|
78
|
+
|
|
42
79
|
|
|
43
80
|
class MetaExtractor(BaseMetaExtractor):
|
|
44
81
|
"""MetaExtractor."""
|
|
@@ -99,6 +136,53 @@ class DoclingConverter:
|
|
|
99
136
|
self._chunker_instance = chunker or HybridChunker()
|
|
100
137
|
self._meta_extractor_instance = meta_extractor or MetaExtractor()
|
|
101
138
|
|
|
139
|
+
def to_dict(self) -> dict[str, Any]:
|
|
140
|
+
"""Serialize this component to a dictionary."""
|
|
141
|
+
if self.converter is not None:
|
|
142
|
+
logger.warning(
|
|
143
|
+
"DoclingConverter.to_dict: the 'converter' parameter cannot be serialized and will be dropped. "
|
|
144
|
+
"The component will use the default DocumentConverter when restored from the serialized form."
|
|
145
|
+
)
|
|
146
|
+
if self.chunker is not None:
|
|
147
|
+
logger.warning(
|
|
148
|
+
"DoclingConverter.to_dict: the 'chunker' parameter cannot be serialized and will be dropped. "
|
|
149
|
+
"The component will use the default chunker when restored from the serialized form."
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
meta_extractor_data = None
|
|
153
|
+
if self.meta_extractor is not None:
|
|
154
|
+
meta_extractor_data = serialize_class_instance(self.meta_extractor)
|
|
155
|
+
|
|
156
|
+
return default_to_dict(
|
|
157
|
+
self,
|
|
158
|
+
converter=None,
|
|
159
|
+
convert_kwargs=self.convert_kwargs,
|
|
160
|
+
export_type=self.export_type.value,
|
|
161
|
+
md_export_kwargs=self.md_export_kwargs,
|
|
162
|
+
chunker=None,
|
|
163
|
+
meta_extractor=meta_extractor_data,
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
@classmethod
|
|
167
|
+
def from_dict(cls, data: dict[str, Any]) -> "DoclingConverter":
|
|
168
|
+
"""
|
|
169
|
+
Deserialize this component from a dictionary.
|
|
170
|
+
|
|
171
|
+
The `converter` and `chunker` parameters are not serializable and are always ignored during
|
|
172
|
+
deserialization; the restored instance will use the default `DocumentConverter` and `HybridChunker`
|
|
173
|
+
respectively.
|
|
174
|
+
|
|
175
|
+
:param data: Dictionary with keys `type` and `init_parameters`, as produced by `to_dict`.
|
|
176
|
+
:returns: A new `DoclingConverter` instance.
|
|
177
|
+
"""
|
|
178
|
+
init_params = data.get("init_parameters", {})
|
|
179
|
+
|
|
180
|
+
meta_extractor_data = init_params.get("meta_extractor")
|
|
181
|
+
if meta_extractor_data is not None:
|
|
182
|
+
init_params["meta_extractor"] = deserialize_class_instance(meta_extractor_data)
|
|
183
|
+
|
|
184
|
+
return default_from_dict(cls, data)
|
|
185
|
+
|
|
102
186
|
@component.output_types(documents=list[Document])
|
|
103
187
|
def run(
|
|
104
188
|
self,
|
|
@@ -141,14 +225,8 @@ class DoclingConverter:
|
|
|
141
225
|
documents: list[Document] = []
|
|
142
226
|
for source, source_meta in zip(sources, meta_list, strict=True):
|
|
143
227
|
if isinstance(source, ByteStream):
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
tmp.write(source.data)
|
|
147
|
-
tmp_path = Path(tmp.name)
|
|
148
|
-
try:
|
|
149
|
-
dl_doc = self._converter_instance.convert(source=tmp_path, **self.convert_kwargs).document
|
|
150
|
-
finally:
|
|
151
|
-
os.unlink(tmp_path)
|
|
228
|
+
doc_stream = _bytestream_to_document_stream(source)
|
|
229
|
+
dl_doc = self._converter_instance.convert(source=doc_stream, **self.convert_kwargs).document
|
|
152
230
|
# merge ByteStream meta (e.g. file_path, mime_type) with user-supplied meta
|
|
153
231
|
merged_meta = {**(source.meta or {}), **source_meta}
|
|
154
232
|
else:
|
|
@@ -1,14 +1,23 @@
|
|
|
1
1
|
import json
|
|
2
|
+
import mimetypes
|
|
2
3
|
import warnings
|
|
4
|
+
from io import BytesIO
|
|
3
5
|
from types import SimpleNamespace
|
|
4
6
|
from typing import Any
|
|
5
|
-
from unittest.mock import MagicMock
|
|
7
|
+
from unittest.mock import MagicMock
|
|
6
8
|
|
|
7
9
|
import pytest
|
|
8
|
-
from haystack.core.serialization import component_from_dict, component_to_dict
|
|
10
|
+
from docling.chunking import HybridChunker
|
|
11
|
+
from docling.document_converter import DocumentConverter
|
|
12
|
+
from docling_core.types.io import DocumentStream
|
|
9
13
|
from haystack.dataclasses import ByteStream
|
|
10
14
|
|
|
11
|
-
from haystack_integrations.components.converters.docling import
|
|
15
|
+
from haystack_integrations.components.converters.docling import (
|
|
16
|
+
DoclingConverter,
|
|
17
|
+
ExportType,
|
|
18
|
+
MetaExtractor,
|
|
19
|
+
)
|
|
20
|
+
from haystack_integrations.components.converters.docling.converter import _bytestream_to_document_stream
|
|
12
21
|
|
|
13
22
|
|
|
14
23
|
def test_run_doc_chunks_minimal() -> None:
|
|
@@ -126,8 +135,6 @@ def test_run_json_minimal() -> None:
|
|
|
126
135
|
|
|
127
136
|
|
|
128
137
|
def test_legacy_import_path() -> None:
|
|
129
|
-
import warnings
|
|
130
|
-
|
|
131
138
|
with warnings.catch_warnings(record=True) as caught:
|
|
132
139
|
warnings.simplefilter("always")
|
|
133
140
|
from docling_haystack.converter import DoclingConverter as LegacyDoclingConverter
|
|
@@ -138,63 +145,59 @@ def test_legacy_import_path() -> None:
|
|
|
138
145
|
)
|
|
139
146
|
|
|
140
147
|
|
|
141
|
-
def
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
# convert_kwargs and md_export_kwargs were always serialized as null.
|
|
145
|
-
# Verify that such a serialized dict still deserializes correctly.
|
|
146
|
-
legacy_data = {
|
|
148
|
+
def test_component_to_dict_defaults() -> None:
|
|
149
|
+
converter = DoclingConverter()
|
|
150
|
+
assert converter.to_dict() == {
|
|
147
151
|
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
|
|
148
152
|
"init_parameters": {
|
|
149
153
|
"converter": None,
|
|
150
|
-
"convert_kwargs":
|
|
154
|
+
"convert_kwargs": {},
|
|
151
155
|
"export_type": "doc_chunks",
|
|
152
|
-
"md_export_kwargs":
|
|
156
|
+
"md_export_kwargs": {"image_placeholder": ""},
|
|
153
157
|
"chunker": None,
|
|
154
158
|
"meta_extractor": None,
|
|
155
159
|
},
|
|
156
160
|
}
|
|
157
|
-
restored = component_from_dict(DoclingConverter, legacy_data, "docling_converter")
|
|
158
|
-
|
|
159
|
-
assert restored.convert_kwargs == {}
|
|
160
|
-
assert restored.md_export_kwargs == {"image_placeholder": ""}
|
|
161
|
-
assert restored.export_type == ExportType.DOC_CHUNKS
|
|
162
|
-
assert restored.converter is None
|
|
163
|
-
assert restored.chunker is None
|
|
164
|
-
assert restored.meta_extractor is None
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
def test_component_to_dict_defaults() -> None:
|
|
168
|
-
converter = DoclingConverter()
|
|
169
|
-
data = component_to_dict(converter, "docling_converter")
|
|
170
|
-
|
|
171
|
-
init_params = data["init_parameters"]
|
|
172
|
-
assert init_params["converter"] is None
|
|
173
|
-
assert init_params["convert_kwargs"] == {}
|
|
174
|
-
assert init_params["export_type"] == ExportType.DOC_CHUNKS
|
|
175
|
-
assert init_params["md_export_kwargs"] == {"image_placeholder": ""}
|
|
176
|
-
assert init_params["chunker"] is None
|
|
177
|
-
assert init_params["meta_extractor"] is None
|
|
178
161
|
|
|
179
162
|
|
|
180
163
|
def test_component_to_dict_custom_params() -> None:
|
|
181
164
|
converter = DoclingConverter(
|
|
165
|
+
converter=DocumentConverter(),
|
|
182
166
|
convert_kwargs={"raises_on_error": False},
|
|
183
167
|
export_type=ExportType.MARKDOWN,
|
|
184
168
|
md_export_kwargs={"image_placeholder": "[img]"},
|
|
169
|
+
meta_extractor=MetaExtractor(),
|
|
185
170
|
)
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
171
|
+
assert converter.to_dict() == {
|
|
172
|
+
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
|
|
173
|
+
"init_parameters": {
|
|
174
|
+
"converter": None,
|
|
175
|
+
"convert_kwargs": {"raises_on_error": False},
|
|
176
|
+
"export_type": "markdown",
|
|
177
|
+
"md_export_kwargs": {"image_placeholder": "[img]"},
|
|
178
|
+
"chunker": None,
|
|
179
|
+
"meta_extractor": {
|
|
180
|
+
"type": "haystack_integrations.components.converters.docling.converter.MetaExtractor",
|
|
181
|
+
"data": {},
|
|
182
|
+
},
|
|
183
|
+
},
|
|
184
|
+
}
|
|
192
185
|
|
|
193
186
|
|
|
194
187
|
def test_component_from_dict_defaults() -> None:
|
|
195
|
-
|
|
196
|
-
data =
|
|
197
|
-
|
|
188
|
+
# null kwargs mirror the pre-refactor serialization format and must still deserialize correctly
|
|
189
|
+
data = {
|
|
190
|
+
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
|
|
191
|
+
"init_parameters": {
|
|
192
|
+
"converter": None,
|
|
193
|
+
"convert_kwargs": None,
|
|
194
|
+
"export_type": "doc_chunks",
|
|
195
|
+
"md_export_kwargs": None,
|
|
196
|
+
"chunker": None,
|
|
197
|
+
"meta_extractor": None,
|
|
198
|
+
},
|
|
199
|
+
}
|
|
200
|
+
restored = DoclingConverter.from_dict(data)
|
|
198
201
|
|
|
199
202
|
assert restored.converter is None
|
|
200
203
|
assert restored.convert_kwargs == {}
|
|
@@ -205,17 +208,44 @@ def test_component_from_dict_defaults() -> None:
|
|
|
205
208
|
|
|
206
209
|
|
|
207
210
|
def test_component_from_dict_custom_params() -> None:
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
211
|
+
data = {
|
|
212
|
+
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
|
|
213
|
+
"init_parameters": {
|
|
214
|
+
"converter": None,
|
|
215
|
+
"convert_kwargs": {"raises_on_error": False},
|
|
216
|
+
"export_type": "json",
|
|
217
|
+
"md_export_kwargs": {"image_placeholder": "[img]"},
|
|
218
|
+
"chunker": None,
|
|
219
|
+
"meta_extractor": {
|
|
220
|
+
"type": "haystack_integrations.components.converters.docling.converter.MetaExtractor",
|
|
221
|
+
"data": {},
|
|
222
|
+
},
|
|
223
|
+
},
|
|
224
|
+
}
|
|
225
|
+
restored = DoclingConverter.from_dict(data)
|
|
215
226
|
|
|
227
|
+
assert restored.converter is None
|
|
216
228
|
assert restored.convert_kwargs == {"raises_on_error": False}
|
|
217
229
|
assert restored.export_type == ExportType.JSON
|
|
218
230
|
assert restored.md_export_kwargs == {"image_placeholder": "[img]"}
|
|
231
|
+
assert restored.chunker is None
|
|
232
|
+
assert isinstance(restored.meta_extractor, MetaExtractor)
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def test_component_to_dict_chunker_warns_and_is_dropped() -> None:
|
|
236
|
+
converter = DoclingConverter(chunker=HybridChunker(merge_peers=False))
|
|
237
|
+
|
|
238
|
+
assert converter.to_dict() == {
|
|
239
|
+
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
|
|
240
|
+
"init_parameters": {
|
|
241
|
+
"converter": None,
|
|
242
|
+
"convert_kwargs": {},
|
|
243
|
+
"export_type": "doc_chunks",
|
|
244
|
+
"md_export_kwargs": {"image_placeholder": ""},
|
|
245
|
+
"chunker": None,
|
|
246
|
+
"meta_extractor": None,
|
|
247
|
+
},
|
|
248
|
+
}
|
|
219
249
|
|
|
220
250
|
|
|
221
251
|
def test_run_with_sources_parameter() -> None:
|
|
@@ -356,13 +386,129 @@ def test_run_with_bytestream_source() -> None:
|
|
|
356
386
|
|
|
357
387
|
bytestream = ByteStream(data=b"%PDF-1.4 fake pdf content", meta={"file_path": "uploaded.pdf"})
|
|
358
388
|
|
|
359
|
-
|
|
360
|
-
result = converter.run(sources=[bytestream])
|
|
389
|
+
result = converter.run(sources=[bytestream])
|
|
361
390
|
|
|
362
391
|
documents = result["documents"]
|
|
363
392
|
assert len(documents) == 1
|
|
364
393
|
# ByteStream meta is merged into the output document
|
|
365
394
|
assert documents[0].meta["file_path"] == "uploaded.pdf"
|
|
366
|
-
# docling was called with a
|
|
395
|
+
# docling was called with a DocumentStream, not a temp file path
|
|
367
396
|
call_args = converter_mock.convert.call_args
|
|
368
|
-
|
|
397
|
+
passed_source = call_args.kwargs["source"]
|
|
398
|
+
assert isinstance(passed_source, DocumentStream)
|
|
399
|
+
assert passed_source.name == "uploaded.pdf"
|
|
400
|
+
assert isinstance(passed_source.stream, BytesIO)
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
class TestBytestreamToDocumentStream:
|
|
404
|
+
def test_uses_file_path(self) -> None:
|
|
405
|
+
bs = ByteStream(data=b"data", meta={"file_path": "report.pdf"})
|
|
406
|
+
ds = _bytestream_to_document_stream(bs)
|
|
407
|
+
assert ds.name == "report.pdf"
|
|
408
|
+
assert ds.stream.read() == b"data"
|
|
409
|
+
|
|
410
|
+
def test_strips_directory_from_file_path(self) -> None:
|
|
411
|
+
bs = ByteStream(data=b"data", meta={"file_path": "/some/deep/path/report.pdf"})
|
|
412
|
+
ds = _bytestream_to_document_stream(bs)
|
|
413
|
+
assert ds.name == "report.pdf"
|
|
414
|
+
|
|
415
|
+
def test_uses_file_name_key(self) -> None:
|
|
416
|
+
bs = ByteStream(data=b"data", meta={"file_name": "slide-deck.pptx"})
|
|
417
|
+
ds = _bytestream_to_document_stream(bs)
|
|
418
|
+
assert ds.name == "slide-deck.pptx"
|
|
419
|
+
|
|
420
|
+
def test_uses_name_key(self) -> None:
|
|
421
|
+
bs = ByteStream(data=b"data", meta={"name": "notes.docx"})
|
|
422
|
+
ds = _bytestream_to_document_stream(bs)
|
|
423
|
+
assert ds.name == "notes.docx"
|
|
424
|
+
|
|
425
|
+
def test_file_path_takes_priority_over_file_name(self) -> None:
|
|
426
|
+
bs = ByteStream(data=b"data", meta={"file_path": "real.pdf", "file_name": "other.pdf"})
|
|
427
|
+
ds = _bytestream_to_document_stream(bs)
|
|
428
|
+
assert ds.name == "real.pdf"
|
|
429
|
+
|
|
430
|
+
def test_file_name_takes_priority_over_name(self) -> None:
|
|
431
|
+
bs = ByteStream(data=b"data", meta={"file_name": "chosen.pdf", "name": "ignored.pdf"})
|
|
432
|
+
ds = _bytestream_to_document_stream(bs)
|
|
433
|
+
assert ds.name == "chosen.pdf"
|
|
434
|
+
|
|
435
|
+
def test_guesses_extension_from_mime_type(self) -> None:
|
|
436
|
+
mime = "application/pdf"
|
|
437
|
+
expected_ext = mimetypes.guess_extension(mime)
|
|
438
|
+
bs = ByteStream(data=b"data", meta={"file_path": "report"}, mime_type=mime)
|
|
439
|
+
ds = _bytestream_to_document_stream(bs)
|
|
440
|
+
assert ds.name == f"report{expected_ext}"
|
|
441
|
+
|
|
442
|
+
def test_keeps_extension_when_present(self) -> None:
|
|
443
|
+
# mime_type should not override an already-present extension
|
|
444
|
+
bs = ByteStream(data=b"data", meta={"file_path": "report.pdf"}, mime_type="text/plain")
|
|
445
|
+
ds = _bytestream_to_document_stream(bs)
|
|
446
|
+
assert ds.name == "report.pdf"
|
|
447
|
+
|
|
448
|
+
def test_no_meta_no_mime_type(self) -> None:
|
|
449
|
+
bs = ByteStream(data=b"data")
|
|
450
|
+
ds = _bytestream_to_document_stream(bs)
|
|
451
|
+
assert ds.name == "document"
|
|
452
|
+
|
|
453
|
+
def test_no_meta_with_mime_type(self) -> None:
|
|
454
|
+
mime = "application/pdf"
|
|
455
|
+
expected_ext = mimetypes.guess_extension(mime)
|
|
456
|
+
bs = ByteStream(data=b"data", mime_type=mime)
|
|
457
|
+
ds = _bytestream_to_document_stream(bs)
|
|
458
|
+
assert ds.name == f"document{expected_ext}"
|
|
459
|
+
|
|
460
|
+
def test_empty_meta_no_mime_type(self) -> None:
|
|
461
|
+
bs = ByteStream(data=b"data", meta={})
|
|
462
|
+
ds = _bytestream_to_document_stream(bs)
|
|
463
|
+
assert ds.name == "document"
|
|
464
|
+
|
|
465
|
+
def test_returns_document_stream_with_bytesio(self) -> None:
|
|
466
|
+
bs = ByteStream(data=b"hello", meta={"file_path": "f.pdf"})
|
|
467
|
+
ds = _bytestream_to_document_stream(bs)
|
|
468
|
+
assert isinstance(ds, DocumentStream)
|
|
469
|
+
assert isinstance(ds.stream, BytesIO)
|
|
470
|
+
|
|
471
|
+
def test_unknown_mime_type_keeps_base_name(self) -> None:
|
|
472
|
+
# mimetypes.guess_extension returns None for unknown types, so the name stays as-is.
|
|
473
|
+
assert mimetypes.guess_extension("application/x-totally-made-up-type") is None
|
|
474
|
+
bs = ByteStream(
|
|
475
|
+
data=b"data",
|
|
476
|
+
meta={"file_path": "report"},
|
|
477
|
+
mime_type="application/x-totally-made-up-type",
|
|
478
|
+
)
|
|
479
|
+
ds = _bytestream_to_document_stream(bs)
|
|
480
|
+
assert ds.name == "report"
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
class TestMetaExtractor:
|
|
484
|
+
def test_extract_chunk_meta_wraps_export_json_dict(self) -> None:
|
|
485
|
+
chunk = MagicMock()
|
|
486
|
+
chunk.export_json_dict.return_value = {"some": "dict"}
|
|
487
|
+
|
|
488
|
+
result = MetaExtractor().extract_chunk_meta(chunk=chunk)
|
|
489
|
+
|
|
490
|
+
assert result == {"dl_meta": {"some": "dict"}}
|
|
491
|
+
chunk.export_json_dict.assert_called_once_with()
|
|
492
|
+
|
|
493
|
+
def test_extract_dl_doc_meta_with_origin(self) -> None:
|
|
494
|
+
dl_doc = MagicMock()
|
|
495
|
+
dl_doc.origin.model_dump.return_value = {"filename": "foo.pdf", "mimetype": "application/pdf"}
|
|
496
|
+
|
|
497
|
+
result = MetaExtractor().extract_dl_doc_meta(dl_doc=dl_doc)
|
|
498
|
+
|
|
499
|
+
assert result == {"dl_meta": {"origin": {"filename": "foo.pdf", "mimetype": "application/pdf"}}}
|
|
500
|
+
dl_doc.origin.model_dump.assert_called_once_with(exclude_none=True)
|
|
501
|
+
|
|
502
|
+
def test_extract_dl_doc_meta_without_origin(self) -> None:
|
|
503
|
+
dl_doc = MagicMock()
|
|
504
|
+
dl_doc.origin = None
|
|
505
|
+
|
|
506
|
+
result = MetaExtractor().extract_dl_doc_meta(dl_doc=dl_doc)
|
|
507
|
+
|
|
508
|
+
assert result == {}
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
def test_run_without_sources_or_paths_raises_value_error() -> None:
|
|
512
|
+
converter = DoclingConverter(converter=MagicMock(), meta_extractor=MagicMock())
|
|
513
|
+
with pytest.raises(ValueError, match=r"Either 'sources' or the deprecated 'paths' parameter must be provided."):
|
|
514
|
+
converter.run()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|