docling-haystack 0.4.0__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling_haystack-0.4.0 → docling_haystack-1.0.0}/CHANGELOG.md +11 -0
- {docling_haystack-0.4.0 → docling_haystack-1.0.0}/PKG-INFO +1 -1
- {docling_haystack-0.4.0 → docling_haystack-1.0.0}/src/haystack_integrations/components/converters/docling/converter.py +21 -14
- {docling_haystack-0.4.0 → docling_haystack-1.0.0}/tests/test_converter.py +107 -3
- {docling_haystack-0.4.0 → docling_haystack-1.0.0}/.gitignore +0 -0
- {docling_haystack-0.4.0 → docling_haystack-1.0.0}/LICENSE.txt +0 -0
- {docling_haystack-0.4.0 → docling_haystack-1.0.0}/README.md +0 -0
- {docling_haystack-0.4.0 → docling_haystack-1.0.0}/pydoc/config_docusaurus.yml +0 -0
- {docling_haystack-0.4.0 → docling_haystack-1.0.0}/pyproject.toml +0 -0
- {docling_haystack-0.4.0 → docling_haystack-1.0.0}/src/docling_haystack/__init__.py +0 -0
- {docling_haystack-0.4.0 → docling_haystack-1.0.0}/src/docling_haystack/converter.py +0 -0
- {docling_haystack-0.4.0 → docling_haystack-1.0.0}/src/haystack_integrations/components/converters/docling/__init__.py +0 -0
- {docling_haystack-0.4.0 → docling_haystack-1.0.0}/src/haystack_integrations/components/converters/py.typed +0 -0
- {docling_haystack-0.4.0 → docling_haystack-1.0.0}/tests/__init__.py +0 -0
|
@@ -1,5 +1,16 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [integrations/docling-v0.4.0] - 2026-05-04
|
|
4
|
+
|
|
5
|
+
### 🚀 Features
|
|
6
|
+
|
|
7
|
+
- Add serialization/deserialization to DoclingConverter (#3267)
|
|
8
|
+
|
|
9
|
+
### 🧪 Testing
|
|
10
|
+
|
|
11
|
+
- Docling - add a few unit tests (#3212)
|
|
12
|
+
|
|
13
|
+
|
|
3
14
|
## [integrations/docling-v0.3.0] - 2026-04-10
|
|
4
15
|
|
|
5
16
|
### 🚀 Features
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling-haystack
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 1.0.0
|
|
4
4
|
Summary: Haystack integration for docling
|
|
5
5
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/docling#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
|
|
@@ -82,7 +82,12 @@ class MetaExtractor(BaseMetaExtractor):
|
|
|
82
82
|
|
|
83
83
|
def extract_chunk_meta(self, chunk: BaseChunk) -> dict[str, Any]:
|
|
84
84
|
"""Extract chunk meta."""
|
|
85
|
-
|
|
85
|
+
meta: dict[str, Any] = {"dl_meta": chunk.export_json_dict()}
|
|
86
|
+
doc_items = getattr(chunk.meta, "doc_items", [])
|
|
87
|
+
page_nos = {prov.page_no for item in doc_items for prov in getattr(item, "prov", [])}
|
|
88
|
+
if page_nos:
|
|
89
|
+
meta["page_number"] = min(page_nos)
|
|
90
|
+
return meta
|
|
86
91
|
|
|
87
92
|
def extract_dl_doc_meta(self, dl_doc: DoclingDocument) -> dict[str, Any]:
|
|
88
93
|
"""Extract Docling document meta."""
|
|
@@ -97,7 +102,7 @@ class DoclingConverter:
|
|
|
97
102
|
self,
|
|
98
103
|
converter: DocumentConverter | None = None,
|
|
99
104
|
convert_kwargs: dict[str, Any] | None = None,
|
|
100
|
-
export_type: ExportType = ExportType.DOC_CHUNKS,
|
|
105
|
+
export_type: ExportType = ExportType.MARKDOWN,
|
|
101
106
|
md_export_kwargs: dict[str, Any] | None = None,
|
|
102
107
|
chunker: BaseChunker | None = None,
|
|
103
108
|
meta_extractor: BaseMetaExtractor | None = None,
|
|
@@ -110,10 +115,10 @@ class DoclingConverter:
|
|
|
110
115
|
:param convert_kwargs: Any parameters to pass to Docling conversion; if not set, a
|
|
111
116
|
system default is used.
|
|
112
117
|
:param export_type: The export mode to use:
|
|
113
|
-
* `ExportType.MARKDOWN` captures each input document as a single
|
|
118
|
+
* `ExportType.MARKDOWN` (default) captures each input document as a single
|
|
114
119
|
markdown `Document`.
|
|
115
|
-
* `ExportType.DOC_CHUNKS` (default) first chunks each input document and then returns
|
|
116
|
-
one `Document` per chunk.
|
|
120
|
+
* `ExportType.DOC_CHUNKS` first chunks each input document and then returns
|
|
121
|
+
one `Document` per chunk.
|
|
117
122
|
* `ExportType.JSON` serializes the full Docling document to a JSON string.
|
|
118
123
|
:param md_export_kwargs: Any parameters to pass to Markdown export (applicable in
|
|
119
124
|
case of `ExportType.MARKDOWN`).
|
|
@@ -234,15 +239,17 @@ class DoclingConverter:
|
|
|
234
239
|
merged_meta = source_meta
|
|
235
240
|
|
|
236
241
|
if self.export_type == ExportType.DOC_CHUNKS:
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
242
|
+
split_idx_start = 0
|
|
243
|
+
for split_id, chunk in enumerate(self._chunker_instance.chunk(dl_doc=dl_doc)):
|
|
244
|
+
content = self._chunker_instance.contextualize(chunk=chunk)
|
|
245
|
+
meta = {
|
|
246
|
+
**self._meta_extractor_instance.extract_chunk_meta(chunk=chunk),
|
|
247
|
+
"split_id": split_id,
|
|
248
|
+
"split_idx_start": split_idx_start,
|
|
249
|
+
**merged_meta,
|
|
250
|
+
}
|
|
251
|
+
documents.append(Document(content=content, meta=meta))
|
|
252
|
+
split_idx_start += len(chunk.text)
|
|
246
253
|
elif self.export_type == ExportType.MARKDOWN:
|
|
247
254
|
hs_doc = Document(
|
|
248
255
|
content=dl_doc.export_to_markdown(**self.md_export_kwargs),
|
|
@@ -59,7 +59,7 @@ def test_run_doc_chunks_minimal() -> None:
|
|
|
59
59
|
|
|
60
60
|
assert "contextualized-chunk-1-of-dl-doc-for-file-a.pdf" in contents
|
|
61
61
|
assert "contextualized-chunk-2-of-dl-doc-for-file-a.pdf" in contents
|
|
62
|
-
assert
|
|
62
|
+
assert any(m.get("chunk_id") == "chunk-1-of-dl-doc-for-file-a.pdf" for m in metas)
|
|
63
63
|
|
|
64
64
|
# Ensure our collaborators were actually exercised.
|
|
65
65
|
assert converter_mock.convert.call_count == len(paths)
|
|
@@ -152,7 +152,7 @@ def test_component_to_dict_defaults() -> None:
|
|
|
152
152
|
"init_parameters": {
|
|
153
153
|
"converter": None,
|
|
154
154
|
"convert_kwargs": {},
|
|
155
|
-
"export_type": "doc_chunks",
|
|
155
|
+
"export_type": "markdown",
|
|
156
156
|
"md_export_kwargs": {"image_placeholder": ""},
|
|
157
157
|
"chunker": None,
|
|
158
158
|
"meta_extractor": None,
|
|
@@ -233,7 +233,7 @@ def test_component_from_dict_custom_params() -> None:
|
|
|
233
233
|
|
|
234
234
|
|
|
235
235
|
def test_component_to_dict_chunker_warns_and_is_dropped() -> None:
|
|
236
|
-
converter = DoclingConverter(chunker=HybridChunker(merge_peers=False))
|
|
236
|
+
converter = DoclingConverter(export_type=ExportType.DOC_CHUNKS, chunker=HybridChunker(merge_peers=False))
|
|
237
237
|
|
|
238
238
|
assert converter.to_dict() == {
|
|
239
239
|
"type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
|
|
@@ -484,12 +484,43 @@ class TestMetaExtractor:
|
|
|
484
484
|
def test_extract_chunk_meta_wraps_export_json_dict(self) -> None:
|
|
485
485
|
chunk = MagicMock()
|
|
486
486
|
chunk.export_json_dict.return_value = {"some": "dict"}
|
|
487
|
+
chunk.meta.doc_items = []
|
|
487
488
|
|
|
488
489
|
result = MetaExtractor().extract_chunk_meta(chunk=chunk)
|
|
489
490
|
|
|
490
491
|
assert result == {"dl_meta": {"some": "dict"}}
|
|
491
492
|
chunk.export_json_dict.assert_called_once_with()
|
|
492
493
|
|
|
494
|
+
def test_extract_chunk_meta_includes_page_number(self) -> None:
|
|
495
|
+
prov = MagicMock()
|
|
496
|
+
prov.page_no = 3
|
|
497
|
+
doc_item = MagicMock()
|
|
498
|
+
doc_item.prov = [prov]
|
|
499
|
+
|
|
500
|
+
chunk = MagicMock()
|
|
501
|
+
chunk.export_json_dict.return_value = {"some": "dict"}
|
|
502
|
+
chunk.meta.doc_items = [doc_item]
|
|
503
|
+
|
|
504
|
+
result = MetaExtractor().extract_chunk_meta(chunk=chunk)
|
|
505
|
+
|
|
506
|
+
assert result == {"dl_meta": {"some": "dict"}, "page_number": 3}
|
|
507
|
+
|
|
508
|
+
def test_extract_chunk_meta_page_number_uses_minimum(self) -> None:
|
|
509
|
+
prov1 = MagicMock()
|
|
510
|
+
prov1.page_no = 5
|
|
511
|
+
prov2 = MagicMock()
|
|
512
|
+
prov2.page_no = 3
|
|
513
|
+
doc_item = MagicMock()
|
|
514
|
+
doc_item.prov = [prov1, prov2]
|
|
515
|
+
|
|
516
|
+
chunk = MagicMock()
|
|
517
|
+
chunk.export_json_dict.return_value = {}
|
|
518
|
+
chunk.meta.doc_items = [doc_item]
|
|
519
|
+
|
|
520
|
+
result = MetaExtractor().extract_chunk_meta(chunk=chunk)
|
|
521
|
+
|
|
522
|
+
assert result["page_number"] == 3
|
|
523
|
+
|
|
493
524
|
def test_extract_dl_doc_meta_with_origin(self) -> None:
|
|
494
525
|
dl_doc = MagicMock()
|
|
495
526
|
dl_doc.origin.model_dump.return_value = {"filename": "foo.pdf", "mimetype": "application/pdf"}
|
|
@@ -512,3 +543,76 @@ def test_run_without_sources_or_paths_raises_value_error() -> None:
|
|
|
512
543
|
converter = DoclingConverter(converter=MagicMock(), meta_extractor=MagicMock())
|
|
513
544
|
with pytest.raises(ValueError, match=r"Either 'sources' or the deprecated 'paths' parameter must be provided."):
|
|
514
545
|
converter.run()
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
def test_run_doc_chunks_split_id_and_split_idx_start() -> None:
|
|
549
|
+
converter_mock = MagicMock()
|
|
550
|
+
chunker_mock = MagicMock()
|
|
551
|
+
meta_extractor_mock = MagicMock()
|
|
552
|
+
|
|
553
|
+
converter_mock.convert.return_value = SimpleNamespace(document="dl-doc")
|
|
554
|
+
|
|
555
|
+
chunks = [
|
|
556
|
+
SimpleNamespace(text="hello world"),
|
|
557
|
+
SimpleNamespace(text="foo bar baz"),
|
|
558
|
+
]
|
|
559
|
+
chunker_mock.chunk.return_value = chunks
|
|
560
|
+
chunker_mock.contextualize.side_effect = lambda chunk: f"ctx:{chunk.text}"
|
|
561
|
+
meta_extractor_mock.extract_chunk_meta.return_value = {}
|
|
562
|
+
|
|
563
|
+
converter = DoclingConverter(
|
|
564
|
+
converter=converter_mock,
|
|
565
|
+
export_type=ExportType.DOC_CHUNKS,
|
|
566
|
+
chunker=chunker_mock,
|
|
567
|
+
meta_extractor=meta_extractor_mock,
|
|
568
|
+
)
|
|
569
|
+
|
|
570
|
+
result = converter.run(sources=["doc.pdf"])
|
|
571
|
+
documents = result["documents"]
|
|
572
|
+
|
|
573
|
+
assert len(documents) == 2
|
|
574
|
+
assert documents[0].meta["split_id"] == 0
|
|
575
|
+
assert documents[0].meta["split_idx_start"] == 0
|
|
576
|
+
assert documents[1].meta["split_id"] == 1
|
|
577
|
+
assert documents[1].meta["split_idx_start"] == len("hello world")
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
def test_run_doc_chunks_split_id_resets_per_document() -> None:
|
|
581
|
+
converter_mock = MagicMock()
|
|
582
|
+
chunker_mock = MagicMock()
|
|
583
|
+
meta_extractor_mock = MagicMock()
|
|
584
|
+
|
|
585
|
+
converter_mock.convert.side_effect = [
|
|
586
|
+
SimpleNamespace(document="dl-doc-a"),
|
|
587
|
+
SimpleNamespace(document="dl-doc-b"),
|
|
588
|
+
]
|
|
589
|
+
chunker_mock.chunk.side_effect = lambda dl_doc: [
|
|
590
|
+
SimpleNamespace(text=f"chunk-1-of-{dl_doc}"),
|
|
591
|
+
SimpleNamespace(text=f"chunk-2-of-{dl_doc}"),
|
|
592
|
+
]
|
|
593
|
+
chunker_mock.contextualize.side_effect = lambda chunk: chunk.text
|
|
594
|
+
meta_extractor_mock.extract_chunk_meta.return_value = {}
|
|
595
|
+
|
|
596
|
+
converter = DoclingConverter(
|
|
597
|
+
converter=converter_mock,
|
|
598
|
+
export_type=ExportType.DOC_CHUNKS,
|
|
599
|
+
chunker=chunker_mock,
|
|
600
|
+
meta_extractor=meta_extractor_mock,
|
|
601
|
+
)
|
|
602
|
+
|
|
603
|
+
result = converter.run(sources=["a.pdf", "b.pdf"])
|
|
604
|
+
documents = result["documents"]
|
|
605
|
+
|
|
606
|
+
# split_id and split_idx_start reset for each source document
|
|
607
|
+
doc_a_chunks = documents[:2]
|
|
608
|
+
doc_b_chunks = documents[2:]
|
|
609
|
+
|
|
610
|
+
assert doc_a_chunks[0].meta["split_id"] == 0
|
|
611
|
+
assert doc_a_chunks[0].meta["split_idx_start"] == 0
|
|
612
|
+
assert doc_a_chunks[1].meta["split_id"] == 1
|
|
613
|
+
assert doc_a_chunks[1].meta["split_idx_start"] == len("chunk-1-of-dl-doc-a")
|
|
614
|
+
|
|
615
|
+
assert doc_b_chunks[0].meta["split_id"] == 0
|
|
616
|
+
assert doc_b_chunks[0].meta["split_idx_start"] == 0
|
|
617
|
+
assert doc_b_chunks[1].meta["split_id"] == 1
|
|
618
|
+
assert doc_b_chunks[1].meta["split_idx_start"] == len("chunk-1-of-dl-doc-b")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|