docling-haystack 0.4.0__tar.gz → 1.0.0__tar.gz

This diff compares the contents of two publicly available versions of the package, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -1,5 +1,16 @@
1
1
  # Changelog
2
2
 
3
+ ## [integrations/docling-v0.4.0] - 2026-05-04
4
+
5
+ ### 🚀 Features
6
+
7
+ - Add serialization/deserialization to DoclingConverter (#3267)
8
+
9
+ ### 🧪 Testing
10
+
11
+ - Docling - add a few unit tests (#3212)
12
+
13
+
3
14
  ## [integrations/docling-v0.3.0] - 2026-04-10
4
15
 
5
16
  ### 🚀 Features
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling-haystack
3
- Version: 0.4.0
3
+ Version: 1.0.0
4
4
  Summary: Haystack integration for docling
5
5
  Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/docling#readme
6
6
  Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
@@ -82,7 +82,12 @@ class MetaExtractor(BaseMetaExtractor):
82
82
 
83
83
  def extract_chunk_meta(self, chunk: BaseChunk) -> dict[str, Any]:
84
84
  """Extract chunk meta."""
85
- return {"dl_meta": chunk.export_json_dict()}
85
+ meta: dict[str, Any] = {"dl_meta": chunk.export_json_dict()}
86
+ doc_items = getattr(chunk.meta, "doc_items", [])
87
+ page_nos = {prov.page_no for item in doc_items for prov in getattr(item, "prov", [])}
88
+ if page_nos:
89
+ meta["page_number"] = min(page_nos)
90
+ return meta
86
91
 
87
92
  def extract_dl_doc_meta(self, dl_doc: DoclingDocument) -> dict[str, Any]:
88
93
  """Extract Docling document meta."""
@@ -97,7 +102,7 @@ class DoclingConverter:
97
102
  self,
98
103
  converter: DocumentConverter | None = None,
99
104
  convert_kwargs: dict[str, Any] | None = None,
100
- export_type: ExportType = ExportType.DOC_CHUNKS,
105
+ export_type: ExportType = ExportType.MARKDOWN,
101
106
  md_export_kwargs: dict[str, Any] | None = None,
102
107
  chunker: BaseChunker | None = None,
103
108
  meta_extractor: BaseMetaExtractor | None = None,
@@ -110,10 +115,10 @@ class DoclingConverter:
110
115
  :param convert_kwargs: Any parameters to pass to Docling conversion; if not set, a
111
116
  system default is used.
112
117
  :param export_type: The export mode to use:
113
- * `ExportType.MARKDOWN` captures each input document as a single
118
+ * `ExportType.MARKDOWN` (default) captures each input document as a single
114
119
  markdown `Document`.
115
- * `ExportType.DOC_CHUNKS` (default) first chunks each input document
116
- and then returns one `Document` per chunk.
120
+ * `ExportType.DOC_CHUNKS` first chunks each input document and then returns
121
+ one `Document` per chunk.
117
122
  * `ExportType.JSON` serializes the full Docling document to a JSON string.
118
123
  :param md_export_kwargs: Any parameters to pass to Markdown export (applicable in
119
124
  case of `ExportType.MARKDOWN`).
@@ -234,15 +239,17 @@ class DoclingConverter:
234
239
  merged_meta = source_meta
235
240
 
236
241
  if self.export_type == ExportType.DOC_CHUNKS:
237
- chunk_iter = self._chunker_instance.chunk(dl_doc=dl_doc)
238
- hs_docs = [
239
- Document(
240
- content=self._chunker_instance.contextualize(chunk=chunk),
241
- meta={**self._meta_extractor_instance.extract_chunk_meta(chunk=chunk), **merged_meta},
242
- )
243
- for chunk in chunk_iter
244
- ]
245
- documents.extend(hs_docs)
242
+ split_idx_start = 0
243
+ for split_id, chunk in enumerate(self._chunker_instance.chunk(dl_doc=dl_doc)):
244
+ content = self._chunker_instance.contextualize(chunk=chunk)
245
+ meta = {
246
+ **self._meta_extractor_instance.extract_chunk_meta(chunk=chunk),
247
+ "split_id": split_id,
248
+ "split_idx_start": split_idx_start,
249
+ **merged_meta,
250
+ }
251
+ documents.append(Document(content=content, meta=meta))
252
+ split_idx_start += len(chunk.text)
246
253
  elif self.export_type == ExportType.MARKDOWN:
247
254
  hs_doc = Document(
248
255
  content=dl_doc.export_to_markdown(**self.md_export_kwargs),
@@ -59,7 +59,7 @@ def test_run_doc_chunks_minimal() -> None:
59
59
 
60
60
  assert "contextualized-chunk-1-of-dl-doc-for-file-a.pdf" in contents
61
61
  assert "contextualized-chunk-2-of-dl-doc-for-file-a.pdf" in contents
62
- assert {"chunk_id": "chunk-1-of-dl-doc-for-file-a.pdf"} in metas
62
+ assert any(m.get("chunk_id") == "chunk-1-of-dl-doc-for-file-a.pdf" for m in metas)
63
63
 
64
64
  # Ensure our collaborators were actually exercised.
65
65
  assert converter_mock.convert.call_count == len(paths)
@@ -152,7 +152,7 @@ def test_component_to_dict_defaults() -> None:
152
152
  "init_parameters": {
153
153
  "converter": None,
154
154
  "convert_kwargs": {},
155
- "export_type": "doc_chunks",
155
+ "export_type": "markdown",
156
156
  "md_export_kwargs": {"image_placeholder": ""},
157
157
  "chunker": None,
158
158
  "meta_extractor": None,
@@ -233,7 +233,7 @@ def test_component_from_dict_custom_params() -> None:
233
233
 
234
234
 
235
235
  def test_component_to_dict_chunker_warns_and_is_dropped() -> None:
236
- converter = DoclingConverter(chunker=HybridChunker(merge_peers=False))
236
+ converter = DoclingConverter(export_type=ExportType.DOC_CHUNKS, chunker=HybridChunker(merge_peers=False))
237
237
 
238
238
  assert converter.to_dict() == {
239
239
  "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
@@ -484,12 +484,43 @@ class TestMetaExtractor:
484
484
  def test_extract_chunk_meta_wraps_export_json_dict(self) -> None:
485
485
  chunk = MagicMock()
486
486
  chunk.export_json_dict.return_value = {"some": "dict"}
487
+ chunk.meta.doc_items = []
487
488
 
488
489
  result = MetaExtractor().extract_chunk_meta(chunk=chunk)
489
490
 
490
491
  assert result == {"dl_meta": {"some": "dict"}}
491
492
  chunk.export_json_dict.assert_called_once_with()
492
493
 
494
+ def test_extract_chunk_meta_includes_page_number(self) -> None:
495
+ prov = MagicMock()
496
+ prov.page_no = 3
497
+ doc_item = MagicMock()
498
+ doc_item.prov = [prov]
499
+
500
+ chunk = MagicMock()
501
+ chunk.export_json_dict.return_value = {"some": "dict"}
502
+ chunk.meta.doc_items = [doc_item]
503
+
504
+ result = MetaExtractor().extract_chunk_meta(chunk=chunk)
505
+
506
+ assert result == {"dl_meta": {"some": "dict"}, "page_number": 3}
507
+
508
+ def test_extract_chunk_meta_page_number_uses_minimum(self) -> None:
509
+ prov1 = MagicMock()
510
+ prov1.page_no = 5
511
+ prov2 = MagicMock()
512
+ prov2.page_no = 3
513
+ doc_item = MagicMock()
514
+ doc_item.prov = [prov1, prov2]
515
+
516
+ chunk = MagicMock()
517
+ chunk.export_json_dict.return_value = {}
518
+ chunk.meta.doc_items = [doc_item]
519
+
520
+ result = MetaExtractor().extract_chunk_meta(chunk=chunk)
521
+
522
+ assert result["page_number"] == 3
523
+
493
524
  def test_extract_dl_doc_meta_with_origin(self) -> None:
494
525
  dl_doc = MagicMock()
495
526
  dl_doc.origin.model_dump.return_value = {"filename": "foo.pdf", "mimetype": "application/pdf"}
@@ -512,3 +543,76 @@ def test_run_without_sources_or_paths_raises_value_error() -> None:
512
543
  converter = DoclingConverter(converter=MagicMock(), meta_extractor=MagicMock())
513
544
  with pytest.raises(ValueError, match=r"Either 'sources' or the deprecated 'paths' parameter must be provided."):
514
545
  converter.run()
546
+
547
+
548
+ def test_run_doc_chunks_split_id_and_split_idx_start() -> None:
549
+ converter_mock = MagicMock()
550
+ chunker_mock = MagicMock()
551
+ meta_extractor_mock = MagicMock()
552
+
553
+ converter_mock.convert.return_value = SimpleNamespace(document="dl-doc")
554
+
555
+ chunks = [
556
+ SimpleNamespace(text="hello world"),
557
+ SimpleNamespace(text="foo bar baz"),
558
+ ]
559
+ chunker_mock.chunk.return_value = chunks
560
+ chunker_mock.contextualize.side_effect = lambda chunk: f"ctx:{chunk.text}"
561
+ meta_extractor_mock.extract_chunk_meta.return_value = {}
562
+
563
+ converter = DoclingConverter(
564
+ converter=converter_mock,
565
+ export_type=ExportType.DOC_CHUNKS,
566
+ chunker=chunker_mock,
567
+ meta_extractor=meta_extractor_mock,
568
+ )
569
+
570
+ result = converter.run(sources=["doc.pdf"])
571
+ documents = result["documents"]
572
+
573
+ assert len(documents) == 2
574
+ assert documents[0].meta["split_id"] == 0
575
+ assert documents[0].meta["split_idx_start"] == 0
576
+ assert documents[1].meta["split_id"] == 1
577
+ assert documents[1].meta["split_idx_start"] == len("hello world")
578
+
579
+
580
+ def test_run_doc_chunks_split_id_resets_per_document() -> None:
581
+ converter_mock = MagicMock()
582
+ chunker_mock = MagicMock()
583
+ meta_extractor_mock = MagicMock()
584
+
585
+ converter_mock.convert.side_effect = [
586
+ SimpleNamespace(document="dl-doc-a"),
587
+ SimpleNamespace(document="dl-doc-b"),
588
+ ]
589
+ chunker_mock.chunk.side_effect = lambda dl_doc: [
590
+ SimpleNamespace(text=f"chunk-1-of-{dl_doc}"),
591
+ SimpleNamespace(text=f"chunk-2-of-{dl_doc}"),
592
+ ]
593
+ chunker_mock.contextualize.side_effect = lambda chunk: chunk.text
594
+ meta_extractor_mock.extract_chunk_meta.return_value = {}
595
+
596
+ converter = DoclingConverter(
597
+ converter=converter_mock,
598
+ export_type=ExportType.DOC_CHUNKS,
599
+ chunker=chunker_mock,
600
+ meta_extractor=meta_extractor_mock,
601
+ )
602
+
603
+ result = converter.run(sources=["a.pdf", "b.pdf"])
604
+ documents = result["documents"]
605
+
606
+ # split_id and split_idx_start reset for each source document
607
+ doc_a_chunks = documents[:2]
608
+ doc_b_chunks = documents[2:]
609
+
610
+ assert doc_a_chunks[0].meta["split_id"] == 0
611
+ assert doc_a_chunks[0].meta["split_idx_start"] == 0
612
+ assert doc_a_chunks[1].meta["split_id"] == 1
613
+ assert doc_a_chunks[1].meta["split_idx_start"] == len("chunk-1-of-dl-doc-a")
614
+
615
+ assert doc_b_chunks[0].meta["split_id"] == 0
616
+ assert doc_b_chunks[0].meta["split_idx_start"] == 0
617
+ assert doc_b_chunks[1].meta["split_id"] == 1
618
+ assert doc_b_chunks[1].meta["split_idx_start"] == len("chunk-1-of-dl-doc-b")