docling-haystack 0.3.0__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling_haystack-1.0.0/CHANGELOG.md +31 -0
- {docling_haystack-0.3.0 → docling_haystack-1.0.0}/PKG-INFO +1 -1
- {docling_haystack-0.3.0 → docling_haystack-1.0.0}/src/haystack_integrations/components/converters/docling/converter.py +82 -15
- {docling_haystack-0.3.0 → docling_haystack-1.0.0}/tests/test_converter.py +226 -51
- docling_haystack-0.3.0/CHANGELOG.md +0 -13
- {docling_haystack-0.3.0 → docling_haystack-1.0.0}/.gitignore +0 -0
- {docling_haystack-0.3.0 → docling_haystack-1.0.0}/LICENSE.txt +0 -0
- {docling_haystack-0.3.0 → docling_haystack-1.0.0}/README.md +0 -0
- {docling_haystack-0.3.0 → docling_haystack-1.0.0}/pydoc/config_docusaurus.yml +0 -0
- {docling_haystack-0.3.0 → docling_haystack-1.0.0}/pyproject.toml +0 -0
- {docling_haystack-0.3.0 → docling_haystack-1.0.0}/src/docling_haystack/__init__.py +0 -0
- {docling_haystack-0.3.0 → docling_haystack-1.0.0}/src/docling_haystack/converter.py +0 -0
- {docling_haystack-0.3.0 → docling_haystack-1.0.0}/src/haystack_integrations/components/converters/docling/__init__.py +0 -0
- {docling_haystack-0.3.0 → docling_haystack-1.0.0}/src/haystack_integrations/components/converters/py.typed +0 -0
- {docling_haystack-0.3.0 → docling_haystack-1.0.0}/tests/__init__.py +0 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [integrations/docling-v0.4.0] - 2026-05-04
|
|
4
|
+
|
|
5
|
+
### 🚀 Features
|
|
6
|
+
|
|
7
|
+
- Add serialization/deserialization to DoclingConverter (#3267)
|
|
8
|
+
|
|
9
|
+
### 🧪 Testing
|
|
10
|
+
|
|
11
|
+
- Docling - add a few unit tests (#3212)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
## [integrations/docling-v0.3.0] - 2026-04-10
|
|
15
|
+
|
|
16
|
+
### 🚀 Features
|
|
17
|
+
|
|
18
|
+
- (docling) Drop temp files for ByteStream sources (#3130)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
## [integrations/docling-v0.2.0] - 2026-04-08
|
|
22
|
+
|
|
23
|
+
### 🚀 Features
|
|
24
|
+
|
|
25
|
+
- Add Docling document converter (#3066)
|
|
26
|
+
|
|
27
|
+
### 🚜 Refactor
|
|
28
|
+
|
|
29
|
+
- *(docling)* Add meta parameter to run(); introduce sources; deprecate paths (#3103)
|
|
30
|
+
|
|
31
|
+
<!-- generated by git-cliff -->
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-haystack
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: Haystack integration for docling
|
|
5
5
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/docling#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
|
|
@@ -10,14 +10,18 @@ from pathlib import Path
|
|
|
10
10
|
from typing import Any
|
|
11
11
|
|
|
12
12
|
from docling_core.types.io import DocumentStream
|
|
13
|
-
from haystack import Document, component
|
|
13
|
+
from haystack import Document, component, logging
|
|
14
14
|
from haystack.components.converters.utils import normalize_metadata
|
|
15
|
+
from haystack.core.serialization import default_from_dict, default_to_dict
|
|
15
16
|
from haystack.dataclasses import ByteStream
|
|
17
|
+
from haystack.utils.base_serialization import deserialize_class_instance, serialize_class_instance
|
|
16
18
|
|
|
17
19
|
from docling.chunking import BaseChunk, BaseChunker, HybridChunker
|
|
18
20
|
from docling.datamodel.document import DoclingDocument
|
|
19
21
|
from docling.document_converter import DocumentConverter
|
|
20
22
|
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
21
25
|
|
|
22
26
|
def _bytestream_to_document_stream(source: ByteStream) -> DocumentStream:
|
|
23
27
|
"""
|
|
@@ -63,13 +67,27 @@ class BaseMetaExtractor(ABC):
|
|
|
63
67
|
"""Extract Docling document meta."""
|
|
64
68
|
raise NotImplementedError()
|
|
65
69
|
|
|
70
|
+
def to_dict(self) -> dict[str, Any]:
|
|
71
|
+
"""Serialize to a dictionary."""
|
|
72
|
+
return {}
|
|
73
|
+
|
|
74
|
+
@classmethod
|
|
75
|
+
def from_dict(cls, data: dict[str, Any]) -> "BaseMetaExtractor": # noqa: ARG003
|
|
76
|
+
"""Deserialize from a dictionary."""
|
|
77
|
+
return cls()
|
|
78
|
+
|
|
66
79
|
|
|
67
80
|
class MetaExtractor(BaseMetaExtractor):
|
|
68
81
|
"""MetaExtractor."""
|
|
69
82
|
|
|
70
83
|
def extract_chunk_meta(self, chunk: BaseChunk) -> dict[str, Any]:
|
|
71
84
|
"""Extract chunk meta."""
|
|
72
|
-
|
|
85
|
+
meta: dict[str, Any] = {"dl_meta": chunk.export_json_dict()}
|
|
86
|
+
doc_items = getattr(chunk.meta, "doc_items", [])
|
|
87
|
+
page_nos = {prov.page_no for item in doc_items for prov in getattr(item, "prov", [])}
|
|
88
|
+
if page_nos:
|
|
89
|
+
meta["page_number"] = min(page_nos)
|
|
90
|
+
return meta
|
|
73
91
|
|
|
74
92
|
def extract_dl_doc_meta(self, dl_doc: DoclingDocument) -> dict[str, Any]:
|
|
75
93
|
"""Extract Docling document meta."""
|
|
@@ -84,7 +102,7 @@ class DoclingConverter:
|
|
|
84
102
|
self,
|
|
85
103
|
converter: DocumentConverter | None = None,
|
|
86
104
|
convert_kwargs: dict[str, Any] | None = None,
|
|
87
|
-
export_type: ExportType = ExportType.DOC_CHUNKS,
|
|
105
|
+
export_type: ExportType = ExportType.MARKDOWN,
|
|
88
106
|
md_export_kwargs: dict[str, Any] | None = None,
|
|
89
107
|
chunker: BaseChunker | None = None,
|
|
90
108
|
meta_extractor: BaseMetaExtractor | None = None,
|
|
@@ -97,10 +115,10 @@ class DoclingConverter:
|
|
|
97
115
|
:param convert_kwargs: Any parameters to pass to Docling conversion; if not set, a
|
|
98
116
|
system default is used.
|
|
99
117
|
:param export_type: The export mode to use:
|
|
100
|
-
* `ExportType.MARKDOWN` captures each input document as a single
|
|
118
|
+
* `ExportType.MARKDOWN` (default) captures each input document as a single
|
|
101
119
|
markdown `Document`.
|
|
102
|
-
* `ExportType.DOC_CHUNKS`
|
|
103
|
-
|
|
120
|
+
* `ExportType.DOC_CHUNKS` first chunks each input document and then returns
|
|
121
|
+
one `Document` per chunk.
|
|
104
122
|
* `ExportType.JSON` serializes the full Docling document to a JSON string.
|
|
105
123
|
:param md_export_kwargs: Any parameters to pass to Markdown export (applicable in
|
|
106
124
|
case of `ExportType.MARKDOWN`).
|
|
@@ -123,6 +141,53 @@ class DoclingConverter:
|
|
|
123
141
|
self._chunker_instance = chunker or HybridChunker()
|
|
124
142
|
self._meta_extractor_instance = meta_extractor or MetaExtractor()
|
|
125
143
|
|
|
144
|
+
def to_dict(self) -> dict[str, Any]:
|
|
145
|
+
"""Serialize this component to a dictionary."""
|
|
146
|
+
if self.converter is not None:
|
|
147
|
+
logger.warning(
|
|
148
|
+
"DoclingConverter.to_dict: the 'converter' parameter cannot be serialized and will be dropped. "
|
|
149
|
+
"The component will use the default DocumentConverter when restored from the serialized form."
|
|
150
|
+
)
|
|
151
|
+
if self.chunker is not None:
|
|
152
|
+
logger.warning(
|
|
153
|
+
"DoclingConverter.to_dict: the 'chunker' parameter cannot be serialized and will be dropped. "
|
|
154
|
+
"The component will use the default chunker when restored from the serialized form."
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
meta_extractor_data = None
|
|
158
|
+
if self.meta_extractor is not None:
|
|
159
|
+
meta_extractor_data = serialize_class_instance(self.meta_extractor)
|
|
160
|
+
|
|
161
|
+
return default_to_dict(
|
|
162
|
+
self,
|
|
163
|
+
converter=None,
|
|
164
|
+
convert_kwargs=self.convert_kwargs,
|
|
165
|
+
export_type=self.export_type.value,
|
|
166
|
+
md_export_kwargs=self.md_export_kwargs,
|
|
167
|
+
chunker=None,
|
|
168
|
+
meta_extractor=meta_extractor_data,
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
@classmethod
|
|
172
|
+
def from_dict(cls, data: dict[str, Any]) -> "DoclingConverter":
|
|
173
|
+
"""
|
|
174
|
+
Deserialize this component from a dictionary.
|
|
175
|
+
|
|
176
|
+
The `converter` and `chunker` parameters are not serializable and are always ignored during
|
|
177
|
+
deserialization; the restored instance will use the default `DocumentConverter` and `HybridChunker`
|
|
178
|
+
respectively.
|
|
179
|
+
|
|
180
|
+
:param data: Dictionary with keys `type` and `init_parameters`, as produced by `to_dict`.
|
|
181
|
+
:returns: A new `DoclingConverter` instance.
|
|
182
|
+
"""
|
|
183
|
+
init_params = data.get("init_parameters", {})
|
|
184
|
+
|
|
185
|
+
meta_extractor_data = init_params.get("meta_extractor")
|
|
186
|
+
if meta_extractor_data is not None:
|
|
187
|
+
init_params["meta_extractor"] = deserialize_class_instance(meta_extractor_data)
|
|
188
|
+
|
|
189
|
+
return default_from_dict(cls, data)
|
|
190
|
+
|
|
126
191
|
@component.output_types(documents=list[Document])
|
|
127
192
|
def run(
|
|
128
193
|
self,
|
|
@@ -174,15 +239,17 @@ class DoclingConverter:
|
|
|
174
239
|
merged_meta = source_meta
|
|
175
240
|
|
|
176
241
|
if self.export_type == ExportType.DOC_CHUNKS:
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
242
|
+
split_idx_start = 0
|
|
243
|
+
for split_id, chunk in enumerate(self._chunker_instance.chunk(dl_doc=dl_doc)):
|
|
244
|
+
content = self._chunker_instance.contextualize(chunk=chunk)
|
|
245
|
+
meta = {
|
|
246
|
+
**self._meta_extractor_instance.extract_chunk_meta(chunk=chunk),
|
|
247
|
+
"split_id": split_id,
|
|
248
|
+
"split_idx_start": split_idx_start,
|
|
249
|
+
**merged_meta,
|
|
250
|
+
}
|
|
251
|
+
documents.append(Document(content=content, meta=meta))
|
|
252
|
+
split_idx_start += len(chunk.text)
|
|
186
253
|
elif self.export_type == ExportType.MARKDOWN:
|
|
187
254
|
hs_doc = Document(
|
|
188
255
|
content=dl_doc.export_to_markdown(**self.md_export_kwargs),
|
|
@@ -7,11 +7,16 @@ from typing import Any
|
|
|
7
7
|
from unittest.mock import MagicMock
|
|
8
8
|
|
|
9
9
|
import pytest
|
|
10
|
+
from docling.chunking import HybridChunker
|
|
11
|
+
from docling.document_converter import DocumentConverter
|
|
10
12
|
from docling_core.types.io import DocumentStream
|
|
11
|
-
from haystack.core.serialization import component_from_dict, component_to_dict
|
|
12
13
|
from haystack.dataclasses import ByteStream
|
|
13
14
|
|
|
14
|
-
from haystack_integrations.components.converters.docling import
|
|
15
|
+
from haystack_integrations.components.converters.docling import (
|
|
16
|
+
DoclingConverter,
|
|
17
|
+
ExportType,
|
|
18
|
+
MetaExtractor,
|
|
19
|
+
)
|
|
15
20
|
from haystack_integrations.components.converters.docling.converter import _bytestream_to_document_stream
|
|
16
21
|
|
|
17
22
|
|
|
@@ -54,7 +59,7 @@ def test_run_doc_chunks_minimal() -> None:
|
|
|
54
59
|
|
|
55
60
|
assert "contextualized-chunk-1-of-dl-doc-for-file-a.pdf" in contents
|
|
56
61
|
assert "contextualized-chunk-2-of-dl-doc-for-file-a.pdf" in contents
|
|
57
|
-
assert
|
|
62
|
+
assert any(m.get("chunk_id") == "chunk-1-of-dl-doc-for-file-a.pdf" for m in metas)
|
|
58
63
|
|
|
59
64
|
# Ensure our collaborators were actually exercised.
|
|
60
65
|
assert converter_mock.convert.call_count == len(paths)
|
|
@@ -130,8 +135,6 @@ def test_run_json_minimal() -> None:
|
|
|
130
135
|
|
|
131
136
|
|
|
132
137
|
def test_legacy_import_path() -> None:
|
|
133
|
-
import warnings
|
|
134
|
-
|
|
135
138
|
with warnings.catch_warnings(record=True) as caught:
|
|
136
139
|
warnings.simplefilter("always")
|
|
137
140
|
from docling_haystack.converter import DoclingConverter as LegacyDoclingConverter
|
|
@@ -142,63 +145,59 @@ def test_legacy_import_path() -> None:
|
|
|
142
145
|
)
|
|
143
146
|
|
|
144
147
|
|
|
145
|
-
def
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
# convert_kwargs and md_export_kwargs were always serialized as null.
|
|
149
|
-
# Verify that such a serialized dict still deserializes correctly.
|
|
150
|
-
legacy_data = {
|
|
148
|
+
def test_component_to_dict_defaults() -> None:
|
|
149
|
+
converter = DoclingConverter()
|
|
150
|
+
assert converter.to_dict() == {
|
|
151
151
|
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
|
|
152
152
|
"init_parameters": {
|
|
153
153
|
"converter": None,
|
|
154
|
-
"convert_kwargs": None,
|
|
155
|
-
"export_type": "doc_chunks",
|
|
156
|
-
"md_export_kwargs": None,
|
|
154
|
+
"convert_kwargs": {},
|
|
155
|
+
"export_type": "markdown",
|
|
156
|
+
"md_export_kwargs": {"image_placeholder": ""},
|
|
157
157
|
"chunker": None,
|
|
158
158
|
"meta_extractor": None,
|
|
159
159
|
},
|
|
160
160
|
}
|
|
161
|
-
restored = component_from_dict(DoclingConverter, legacy_data, "docling_converter")
|
|
162
|
-
|
|
163
|
-
assert restored.convert_kwargs == {}
|
|
164
|
-
assert restored.md_export_kwargs == {"image_placeholder": ""}
|
|
165
|
-
assert restored.export_type == ExportType.DOC_CHUNKS
|
|
166
|
-
assert restored.converter is None
|
|
167
|
-
assert restored.chunker is None
|
|
168
|
-
assert restored.meta_extractor is None
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
def test_component_to_dict_defaults() -> None:
|
|
172
|
-
converter = DoclingConverter()
|
|
173
|
-
data = component_to_dict(converter, "docling_converter")
|
|
174
|
-
|
|
175
|
-
init_params = data["init_parameters"]
|
|
176
|
-
assert init_params["converter"] is None
|
|
177
|
-
assert init_params["convert_kwargs"] == {}
|
|
178
|
-
assert init_params["export_type"] == ExportType.DOC_CHUNKS
|
|
179
|
-
assert init_params["md_export_kwargs"] == {"image_placeholder": ""}
|
|
180
|
-
assert init_params["chunker"] is None
|
|
181
|
-
assert init_params["meta_extractor"] is None
|
|
182
161
|
|
|
183
162
|
|
|
184
163
|
def test_component_to_dict_custom_params() -> None:
|
|
185
164
|
converter = DoclingConverter(
|
|
165
|
+
converter=DocumentConverter(),
|
|
186
166
|
convert_kwargs={"raises_on_error": False},
|
|
187
167
|
export_type=ExportType.MARKDOWN,
|
|
188
168
|
md_export_kwargs={"image_placeholder": "[img]"},
|
|
169
|
+
meta_extractor=MetaExtractor(),
|
|
189
170
|
)
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
171
|
+
assert converter.to_dict() == {
|
|
172
|
+
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
|
|
173
|
+
"init_parameters": {
|
|
174
|
+
"converter": None,
|
|
175
|
+
"convert_kwargs": {"raises_on_error": False},
|
|
176
|
+
"export_type": "markdown",
|
|
177
|
+
"md_export_kwargs": {"image_placeholder": "[img]"},
|
|
178
|
+
"chunker": None,
|
|
179
|
+
"meta_extractor": {
|
|
180
|
+
"type": "haystack_integrations.components.converters.docling.converter.MetaExtractor",
|
|
181
|
+
"data": {},
|
|
182
|
+
},
|
|
183
|
+
},
|
|
184
|
+
}
|
|
196
185
|
|
|
197
186
|
|
|
198
187
|
def test_component_from_dict_defaults() -> None:
|
|
199
|
-
|
|
200
|
-
data =
|
|
201
|
-
|
|
188
|
+
# null kwargs mirror the pre-refactor serialization format and must still deserialize correctly
|
|
189
|
+
data = {
|
|
190
|
+
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
|
|
191
|
+
"init_parameters": {
|
|
192
|
+
"converter": None,
|
|
193
|
+
"convert_kwargs": None,
|
|
194
|
+
"export_type": "doc_chunks",
|
|
195
|
+
"md_export_kwargs": None,
|
|
196
|
+
"chunker": None,
|
|
197
|
+
"meta_extractor": None,
|
|
198
|
+
},
|
|
199
|
+
}
|
|
200
|
+
restored = DoclingConverter.from_dict(data)
|
|
202
201
|
|
|
203
202
|
assert restored.converter is None
|
|
204
203
|
assert restored.convert_kwargs == {}
|
|
@@ -209,17 +208,44 @@ def test_component_from_dict_defaults() -> None:
|
|
|
209
208
|
|
|
210
209
|
|
|
211
210
|
def test_component_from_dict_custom_params() -> None:
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
211
|
+
data = {
|
|
212
|
+
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
|
|
213
|
+
"init_parameters": {
|
|
214
|
+
"converter": None,
|
|
215
|
+
"convert_kwargs": {"raises_on_error": False},
|
|
216
|
+
"export_type": "json",
|
|
217
|
+
"md_export_kwargs": {"image_placeholder": "[img]"},
|
|
218
|
+
"chunker": None,
|
|
219
|
+
"meta_extractor": {
|
|
220
|
+
"type": "haystack_integrations.components.converters.docling.converter.MetaExtractor",
|
|
221
|
+
"data": {},
|
|
222
|
+
},
|
|
223
|
+
},
|
|
224
|
+
}
|
|
225
|
+
restored = DoclingConverter.from_dict(data)
|
|
219
226
|
|
|
227
|
+
assert restored.converter is None
|
|
220
228
|
assert restored.convert_kwargs == {"raises_on_error": False}
|
|
221
229
|
assert restored.export_type == ExportType.JSON
|
|
222
230
|
assert restored.md_export_kwargs == {"image_placeholder": "[img]"}
|
|
231
|
+
assert restored.chunker is None
|
|
232
|
+
assert isinstance(restored.meta_extractor, MetaExtractor)
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def test_component_to_dict_chunker_warns_and_is_dropped() -> None:
|
|
236
|
+
converter = DoclingConverter(export_type=ExportType.DOC_CHUNKS, chunker=HybridChunker(merge_peers=False))
|
|
237
|
+
|
|
238
|
+
assert converter.to_dict() == {
|
|
239
|
+
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
|
|
240
|
+
"init_parameters": {
|
|
241
|
+
"converter": None,
|
|
242
|
+
"convert_kwargs": {},
|
|
243
|
+
"export_type": "doc_chunks",
|
|
244
|
+
"md_export_kwargs": {"image_placeholder": ""},
|
|
245
|
+
"chunker": None,
|
|
246
|
+
"meta_extractor": None,
|
|
247
|
+
},
|
|
248
|
+
}
|
|
223
249
|
|
|
224
250
|
|
|
225
251
|
def test_run_with_sources_parameter() -> None:
|
|
@@ -441,3 +467,152 @@ class TestBytestreamToDocumentStream:
|
|
|
441
467
|
ds = _bytestream_to_document_stream(bs)
|
|
442
468
|
assert isinstance(ds, DocumentStream)
|
|
443
469
|
assert isinstance(ds.stream, BytesIO)
|
|
470
|
+
|
|
471
|
+
def test_unknown_mime_type_keeps_base_name(self) -> None:
|
|
472
|
+
# mimetypes.guess_extension returns None for unknown types, so the name stays as-is.
|
|
473
|
+
assert mimetypes.guess_extension("application/x-totally-made-up-type") is None
|
|
474
|
+
bs = ByteStream(
|
|
475
|
+
data=b"data",
|
|
476
|
+
meta={"file_path": "report"},
|
|
477
|
+
mime_type="application/x-totally-made-up-type",
|
|
478
|
+
)
|
|
479
|
+
ds = _bytestream_to_document_stream(bs)
|
|
480
|
+
assert ds.name == "report"
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
class TestMetaExtractor:
|
|
484
|
+
def test_extract_chunk_meta_wraps_export_json_dict(self) -> None:
|
|
485
|
+
chunk = MagicMock()
|
|
486
|
+
chunk.export_json_dict.return_value = {"some": "dict"}
|
|
487
|
+
chunk.meta.doc_items = []
|
|
488
|
+
|
|
489
|
+
result = MetaExtractor().extract_chunk_meta(chunk=chunk)
|
|
490
|
+
|
|
491
|
+
assert result == {"dl_meta": {"some": "dict"}}
|
|
492
|
+
chunk.export_json_dict.assert_called_once_with()
|
|
493
|
+
|
|
494
|
+
def test_extract_chunk_meta_includes_page_number(self) -> None:
|
|
495
|
+
prov = MagicMock()
|
|
496
|
+
prov.page_no = 3
|
|
497
|
+
doc_item = MagicMock()
|
|
498
|
+
doc_item.prov = [prov]
|
|
499
|
+
|
|
500
|
+
chunk = MagicMock()
|
|
501
|
+
chunk.export_json_dict.return_value = {"some": "dict"}
|
|
502
|
+
chunk.meta.doc_items = [doc_item]
|
|
503
|
+
|
|
504
|
+
result = MetaExtractor().extract_chunk_meta(chunk=chunk)
|
|
505
|
+
|
|
506
|
+
assert result == {"dl_meta": {"some": "dict"}, "page_number": 3}
|
|
507
|
+
|
|
508
|
+
def test_extract_chunk_meta_page_number_uses_minimum(self) -> None:
|
|
509
|
+
prov1 = MagicMock()
|
|
510
|
+
prov1.page_no = 5
|
|
511
|
+
prov2 = MagicMock()
|
|
512
|
+
prov2.page_no = 3
|
|
513
|
+
doc_item = MagicMock()
|
|
514
|
+
doc_item.prov = [prov1, prov2]
|
|
515
|
+
|
|
516
|
+
chunk = MagicMock()
|
|
517
|
+
chunk.export_json_dict.return_value = {}
|
|
518
|
+
chunk.meta.doc_items = [doc_item]
|
|
519
|
+
|
|
520
|
+
result = MetaExtractor().extract_chunk_meta(chunk=chunk)
|
|
521
|
+
|
|
522
|
+
assert result["page_number"] == 3
|
|
523
|
+
|
|
524
|
+
def test_extract_dl_doc_meta_with_origin(self) -> None:
|
|
525
|
+
dl_doc = MagicMock()
|
|
526
|
+
dl_doc.origin.model_dump.return_value = {"filename": "foo.pdf", "mimetype": "application/pdf"}
|
|
527
|
+
|
|
528
|
+
result = MetaExtractor().extract_dl_doc_meta(dl_doc=dl_doc)
|
|
529
|
+
|
|
530
|
+
assert result == {"dl_meta": {"origin": {"filename": "foo.pdf", "mimetype": "application/pdf"}}}
|
|
531
|
+
dl_doc.origin.model_dump.assert_called_once_with(exclude_none=True)
|
|
532
|
+
|
|
533
|
+
def test_extract_dl_doc_meta_without_origin(self) -> None:
|
|
534
|
+
dl_doc = MagicMock()
|
|
535
|
+
dl_doc.origin = None
|
|
536
|
+
|
|
537
|
+
result = MetaExtractor().extract_dl_doc_meta(dl_doc=dl_doc)
|
|
538
|
+
|
|
539
|
+
assert result == {}
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
def test_run_without_sources_or_paths_raises_value_error() -> None:
|
|
543
|
+
converter = DoclingConverter(converter=MagicMock(), meta_extractor=MagicMock())
|
|
544
|
+
with pytest.raises(ValueError, match=r"Either 'sources' or the deprecated 'paths' parameter must be provided."):
|
|
545
|
+
converter.run()
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
def test_run_doc_chunks_split_id_and_split_idx_start() -> None:
|
|
549
|
+
converter_mock = MagicMock()
|
|
550
|
+
chunker_mock = MagicMock()
|
|
551
|
+
meta_extractor_mock = MagicMock()
|
|
552
|
+
|
|
553
|
+
converter_mock.convert.return_value = SimpleNamespace(document="dl-doc")
|
|
554
|
+
|
|
555
|
+
chunks = [
|
|
556
|
+
SimpleNamespace(text="hello world"),
|
|
557
|
+
SimpleNamespace(text="foo bar baz"),
|
|
558
|
+
]
|
|
559
|
+
chunker_mock.chunk.return_value = chunks
|
|
560
|
+
chunker_mock.contextualize.side_effect = lambda chunk: f"ctx:{chunk.text}"
|
|
561
|
+
meta_extractor_mock.extract_chunk_meta.return_value = {}
|
|
562
|
+
|
|
563
|
+
converter = DoclingConverter(
|
|
564
|
+
converter=converter_mock,
|
|
565
|
+
export_type=ExportType.DOC_CHUNKS,
|
|
566
|
+
chunker=chunker_mock,
|
|
567
|
+
meta_extractor=meta_extractor_mock,
|
|
568
|
+
)
|
|
569
|
+
|
|
570
|
+
result = converter.run(sources=["doc.pdf"])
|
|
571
|
+
documents = result["documents"]
|
|
572
|
+
|
|
573
|
+
assert len(documents) == 2
|
|
574
|
+
assert documents[0].meta["split_id"] == 0
|
|
575
|
+
assert documents[0].meta["split_idx_start"] == 0
|
|
576
|
+
assert documents[1].meta["split_id"] == 1
|
|
577
|
+
assert documents[1].meta["split_idx_start"] == len("hello world")
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
def test_run_doc_chunks_split_id_resets_per_document() -> None:
|
|
581
|
+
converter_mock = MagicMock()
|
|
582
|
+
chunker_mock = MagicMock()
|
|
583
|
+
meta_extractor_mock = MagicMock()
|
|
584
|
+
|
|
585
|
+
converter_mock.convert.side_effect = [
|
|
586
|
+
SimpleNamespace(document="dl-doc-a"),
|
|
587
|
+
SimpleNamespace(document="dl-doc-b"),
|
|
588
|
+
]
|
|
589
|
+
chunker_mock.chunk.side_effect = lambda dl_doc: [
|
|
590
|
+
SimpleNamespace(text=f"chunk-1-of-{dl_doc}"),
|
|
591
|
+
SimpleNamespace(text=f"chunk-2-of-{dl_doc}"),
|
|
592
|
+
]
|
|
593
|
+
chunker_mock.contextualize.side_effect = lambda chunk: chunk.text
|
|
594
|
+
meta_extractor_mock.extract_chunk_meta.return_value = {}
|
|
595
|
+
|
|
596
|
+
converter = DoclingConverter(
|
|
597
|
+
converter=converter_mock,
|
|
598
|
+
export_type=ExportType.DOC_CHUNKS,
|
|
599
|
+
chunker=chunker_mock,
|
|
600
|
+
meta_extractor=meta_extractor_mock,
|
|
601
|
+
)
|
|
602
|
+
|
|
603
|
+
result = converter.run(sources=["a.pdf", "b.pdf"])
|
|
604
|
+
documents = result["documents"]
|
|
605
|
+
|
|
606
|
+
# split_id and split_idx_start reset for each source document
|
|
607
|
+
doc_a_chunks = documents[:2]
|
|
608
|
+
doc_b_chunks = documents[2:]
|
|
609
|
+
|
|
610
|
+
assert doc_a_chunks[0].meta["split_id"] == 0
|
|
611
|
+
assert doc_a_chunks[0].meta["split_idx_start"] == 0
|
|
612
|
+
assert doc_a_chunks[1].meta["split_id"] == 1
|
|
613
|
+
assert doc_a_chunks[1].meta["split_idx_start"] == len("chunk-1-of-dl-doc-a")
|
|
614
|
+
|
|
615
|
+
assert doc_b_chunks[0].meta["split_id"] == 0
|
|
616
|
+
assert doc_b_chunks[0].meta["split_idx_start"] == 0
|
|
617
|
+
assert doc_b_chunks[1].meta["split_id"] == 1
|
|
618
|
+
assert doc_b_chunks[1].meta["split_idx_start"] == len("chunk-1-of-dl-doc-b")
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
# Changelog
|
|
2
|
-
|
|
3
|
-
## [integrations/docling-v0.2.0] - 2026-04-08
|
|
4
|
-
|
|
5
|
-
### 🚀 Features
|
|
6
|
-
|
|
7
|
-
- Add Docling document converter (#3066)
|
|
8
|
-
|
|
9
|
-
### 🚜 Refactor
|
|
10
|
-
|
|
11
|
-
- *(docling)* Add meta parameter to run(); introduce sources; deprecate paths (#3103)
|
|
12
|
-
|
|
13
|
-
<!-- generated by git-cliff -->
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|