docling-haystack 0.3.0__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,31 @@
+ # Changelog
+
+ ## [integrations/docling-v0.4.0] - 2026-05-04
+
+ ### 🚀 Features
+
+ - Add serialization/deserialization to DoclingConverter (#3267)
+
+ ### 🧪 Testing
+
+ - Docling - add a few unit tests (#3212)
+
+
+ ## [integrations/docling-v0.3.0] - 2026-04-10
+
+ ### 🚀 Features
+
+ - (docling) Drop temp files for ByteStream sources (#3130)
+
+
+ ## [integrations/docling-v0.2.0] - 2026-04-08
+
+ ### 🚀 Features
+
+ - Add Docling document converter (#3066)
+
+ ### 🚜 Refactor
+
+ - *(docling)* Add meta parameter to run(); introduce sources; deprecate paths (#3103)
+
+ <!-- generated by git-cliff -->
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: docling-haystack
- Version: 0.3.0
+ Version: 1.0.0
  Summary: Haystack integration for docling
  Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/docling#readme
  Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
@@ -10,14 +10,18 @@ from pathlib import Path
  from typing import Any

  from docling_core.types.io import DocumentStream
- from haystack import Document, component
+ from haystack import Document, component, logging
  from haystack.components.converters.utils import normalize_metadata
+ from haystack.core.serialization import default_from_dict, default_to_dict
  from haystack.dataclasses import ByteStream
+ from haystack.utils.base_serialization import deserialize_class_instance, serialize_class_instance

  from docling.chunking import BaseChunk, BaseChunker, HybridChunker
  from docling.datamodel.document import DoclingDocument
  from docling.document_converter import DocumentConverter

+ logger = logging.getLogger(__name__)
+

  def _bytestream_to_document_stream(source: ByteStream) -> DocumentStream:
      """
@@ -63,13 +67,27 @@ class BaseMetaExtractor(ABC):
          """Extract Docling document meta."""
          raise NotImplementedError()

+     def to_dict(self) -> dict[str, Any]:
+         """Serialize to a dictionary."""
+         return {}
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> "BaseMetaExtractor":  # noqa: ARG003
+         """Deserialize from a dictionary."""
+         return cls()
+

  class MetaExtractor(BaseMetaExtractor):
      """MetaExtractor."""

      def extract_chunk_meta(self, chunk: BaseChunk) -> dict[str, Any]:
          """Extract chunk meta."""
-         return {"dl_meta": chunk.export_json_dict()}
+         meta: dict[str, Any] = {"dl_meta": chunk.export_json_dict()}
+         doc_items = getattr(chunk.meta, "doc_items", [])
+         page_nos = {prov.page_no for item in doc_items for prov in getattr(item, "prov", [])}
+         if page_nos:
+             meta["page_number"] = min(page_nos)
+         return meta

      def extract_dl_doc_meta(self, dl_doc: DoclingDocument) -> dict[str, Any]:
          """Extract Docling document meta."""
@@ -84,7 +102,7 @@ class DoclingConverter:
          self,
          converter: DocumentConverter | None = None,
          convert_kwargs: dict[str, Any] | None = None,
-         export_type: ExportType = ExportType.DOC_CHUNKS,
+         export_type: ExportType = ExportType.MARKDOWN,
          md_export_kwargs: dict[str, Any] | None = None,
          chunker: BaseChunker | None = None,
          meta_extractor: BaseMetaExtractor | None = None,
@@ -97,10 +115,10 @@ class DoclingConverter:
          :param convert_kwargs: Any parameters to pass to Docling conversion; if not set, a
              system default is used.
          :param export_type: The export mode to use:
-             * `ExportType.MARKDOWN` captures each input document as a single
+             * `ExportType.MARKDOWN` (default) captures each input document as a single
                markdown `Document`.
-             * `ExportType.DOC_CHUNKS` (default) first chunks each input document
-               and then returns one `Document` per chunk.
+             * `ExportType.DOC_CHUNKS` first chunks each input document and then returns
+               one `Document` per chunk.
              * `ExportType.JSON` serializes the full Docling document to a JSON string.
          :param md_export_kwargs: Any parameters to pass to Markdown export (applicable in
              case of `ExportType.MARKDOWN`).
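Outside the diff, a minimal usage sketch of the new default; `report.pdf` is a placeholder path, and the `{"documents": [...]}` return shape follows the component's `run` output used in the tests below:

```python
from haystack_integrations.components.converters.docling import DoclingConverter, ExportType

# MARKDOWN is now the default export mode; DOC_CHUNKS must be requested explicitly.
md_converter = DoclingConverter()  # same as export_type=ExportType.MARKDOWN
chunking_converter = DoclingConverter(export_type=ExportType.DOC_CHUNKS)

result = md_converter.run(sources=["report.pdf"])  # placeholder source
documents = result["documents"]  # one markdown Document per input in MARKDOWN mode
```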
@@ -123,6 +141,53 @@ class DoclingConverter:
          self._chunker_instance = chunker or HybridChunker()
          self._meta_extractor_instance = meta_extractor or MetaExtractor()

+     def to_dict(self) -> dict[str, Any]:
+         """Serialize this component to a dictionary."""
+         if self.converter is not None:
+             logger.warning(
+                 "DoclingConverter.to_dict: the 'converter' parameter cannot be serialized and will be dropped. "
+                 "The component will use the default DocumentConverter when restored from the serialized form."
+             )
+         if self.chunker is not None:
+             logger.warning(
+                 "DoclingConverter.to_dict: the 'chunker' parameter cannot be serialized and will be dropped. "
+                 "The component will use the default chunker when restored from the serialized form."
+             )
+
+         meta_extractor_data = None
+         if self.meta_extractor is not None:
+             meta_extractor_data = serialize_class_instance(self.meta_extractor)
+
+         return default_to_dict(
+             self,
+             converter=None,
+             convert_kwargs=self.convert_kwargs,
+             export_type=self.export_type.value,
+             md_export_kwargs=self.md_export_kwargs,
+             chunker=None,
+             meta_extractor=meta_extractor_data,
+         )
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> "DoclingConverter":
+         """
+         Deserialize this component from a dictionary.
+
+         The `converter` and `chunker` parameters are not serializable and are always ignored during
+         deserialization; the restored instance will use the default `DocumentConverter` and `HybridChunker`
+         respectively.
+
+         :param data: Dictionary with keys `type` and `init_parameters`, as produced by `to_dict`.
+         :returns: A new `DoclingConverter` instance.
+         """
+         init_params = data.get("init_parameters", {})
+
+         meta_extractor_data = init_params.get("meta_extractor")
+         if meta_extractor_data is not None:
+             init_params["meta_extractor"] = deserialize_class_instance(meta_extractor_data)
+
+         return default_from_dict(cls, data)
+
      @component.output_types(documents=list[Document])
      def run(
          self,
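A round-trip sketch of the new serialization hooks, assuming the behavior shown above: non-serializable `converter`/`chunker` arguments are dropped with a warning, and the restored instance uses the defaults instead:

```python
from docling.document_converter import DocumentConverter
from haystack_integrations.components.converters.docling import DoclingConverter, ExportType

converter = DoclingConverter(
    converter=DocumentConverter(),  # not serializable: logged and dropped by to_dict()
    export_type=ExportType.DOC_CHUNKS,
    convert_kwargs={"raises_on_error": False},
)

data = converter.to_dict()
assert data["init_parameters"]["converter"] is None
assert data["init_parameters"]["export_type"] == "doc_chunks"

restored = DoclingConverter.from_dict(data)
assert restored.converter is None  # the default DocumentConverter is used instead
assert restored.convert_kwargs == {"raises_on_error": False}
```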
@@ -174,15 +239,17 @@ class DoclingConverter:
                  merged_meta = source_meta

              if self.export_type == ExportType.DOC_CHUNKS:
-                 chunk_iter = self._chunker_instance.chunk(dl_doc=dl_doc)
-                 hs_docs = [
-                     Document(
-                         content=self._chunker_instance.contextualize(chunk=chunk),
-                         meta={**self._meta_extractor_instance.extract_chunk_meta(chunk=chunk), **merged_meta},
-                     )
-                     for chunk in chunk_iter
-                 ]
-                 documents.extend(hs_docs)
+                 split_idx_start = 0
+                 for split_id, chunk in enumerate(self._chunker_instance.chunk(dl_doc=dl_doc)):
+                     content = self._chunker_instance.contextualize(chunk=chunk)
+                     meta = {
+                         **self._meta_extractor_instance.extract_chunk_meta(chunk=chunk),
+                         "split_id": split_id,
+                         "split_idx_start": split_idx_start,
+                         **merged_meta,
+                     }
+                     documents.append(Document(content=content, meta=meta))
+                     split_idx_start += len(chunk.text)
              elif self.export_type == ExportType.MARKDOWN:
                  hs_doc = Document(
                      content=dl_doc.export_to_markdown(**self.md_export_kwargs),
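A small arithmetic sketch of the bookkeeping in the `DOC_CHUNKS` branch above; the chunk texts are made up, and `split_idx_start` advances by `len(chunk.text)` exactly as in the loop:

```python
chunk_texts = ["hello world", "foo bar baz"]  # made-up chunk contents

split_idx_start = 0
expected = []
for split_id, text in enumerate(chunk_texts):
    expected.append({"split_id": split_id, "split_idx_start": split_idx_start})
    split_idx_start += len(text)

assert expected == [
    {"split_id": 0, "split_idx_start": 0},
    {"split_id": 1, "split_idx_start": len("hello world")},  # 11
]
```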
@@ -7,11 +7,16 @@ from typing import Any
  from unittest.mock import MagicMock

  import pytest
+ from docling.chunking import HybridChunker
+ from docling.document_converter import DocumentConverter
  from docling_core.types.io import DocumentStream
- from haystack.core.serialization import component_from_dict, component_to_dict
  from haystack.dataclasses import ByteStream

- from haystack_integrations.components.converters.docling import DoclingConverter, ExportType
+ from haystack_integrations.components.converters.docling import (
+     DoclingConverter,
+     ExportType,
+     MetaExtractor,
+ )
  from haystack_integrations.components.converters.docling.converter import _bytestream_to_document_stream


@@ -54,7 +59,7 @@ def test_run_doc_chunks_minimal() -> None:

      assert "contextualized-chunk-1-of-dl-doc-for-file-a.pdf" in contents
      assert "contextualized-chunk-2-of-dl-doc-for-file-a.pdf" in contents
-     assert {"chunk_id": "chunk-1-of-dl-doc-for-file-a.pdf"} in metas
+     assert any(m.get("chunk_id") == "chunk-1-of-dl-doc-for-file-a.pdf" for m in metas)

      # Ensure our collaborators were actually exercised.
      assert converter_mock.convert.call_count == len(paths)
@@ -130,8 +135,6 @@ def test_run_json_minimal() -> None:


  def test_legacy_import_path() -> None:
-     import warnings
-
      with warnings.catch_warnings(record=True) as caught:
          warnings.simplefilter("always")
          from docling_haystack.converter import DoclingConverter as LegacyDoclingConverter
@@ -142,63 +145,59 @@ def test_legacy_import_path() -> None:
      )


- def test_component_from_dict_legacy_nulls() -> None:
-     # Before the public-attribute refactor, default serialization couldn't find
-     # the _-prefixed attributes and fell back to the init defaults, so
-     # convert_kwargs and md_export_kwargs were always serialized as null.
-     # Verify that such a serialized dict still deserializes correctly.
-     legacy_data = {
+ def test_component_to_dict_defaults() -> None:
+     converter = DoclingConverter()
+     assert converter.to_dict() == {
          "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
          "init_parameters": {
              "converter": None,
-             "convert_kwargs": None,
-             "export_type": "doc_chunks",
-             "md_export_kwargs": None,
+             "convert_kwargs": {},
+             "export_type": "markdown",
+             "md_export_kwargs": {"image_placeholder": ""},
              "chunker": None,
              "meta_extractor": None,
          },
      }
-     restored = component_from_dict(DoclingConverter, legacy_data, "docling_converter")
-
-     assert restored.convert_kwargs == {}
-     assert restored.md_export_kwargs == {"image_placeholder": ""}
-     assert restored.export_type == ExportType.DOC_CHUNKS
-     assert restored.converter is None
-     assert restored.chunker is None
-     assert restored.meta_extractor is None
-
-
- def test_component_to_dict_defaults() -> None:
-     converter = DoclingConverter()
-     data = component_to_dict(converter, "docling_converter")
-
-     init_params = data["init_parameters"]
-     assert init_params["converter"] is None
-     assert init_params["convert_kwargs"] == {}
-     assert init_params["export_type"] == ExportType.DOC_CHUNKS
-     assert init_params["md_export_kwargs"] == {"image_placeholder": ""}
-     assert init_params["chunker"] is None
-     assert init_params["meta_extractor"] is None


  def test_component_to_dict_custom_params() -> None:
      converter = DoclingConverter(
+         converter=DocumentConverter(),
          convert_kwargs={"raises_on_error": False},
          export_type=ExportType.MARKDOWN,
          md_export_kwargs={"image_placeholder": "[img]"},
+         meta_extractor=MetaExtractor(),
      )
-     data = component_to_dict(converter, "docling_converter")
-
-     init_params = data["init_parameters"]
-     assert init_params["convert_kwargs"] == {"raises_on_error": False}
-     assert init_params["export_type"] == ExportType.MARKDOWN
-     assert init_params["md_export_kwargs"] == {"image_placeholder": "[img]"}
+     assert converter.to_dict() == {
+         "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
+         "init_parameters": {
+             "converter": None,
+             "convert_kwargs": {"raises_on_error": False},
+             "export_type": "markdown",
+             "md_export_kwargs": {"image_placeholder": "[img]"},
+             "chunker": None,
+             "meta_extractor": {
+                 "type": "haystack_integrations.components.converters.docling.converter.MetaExtractor",
+                 "data": {},
+             },
+         },
+     }


  def test_component_from_dict_defaults() -> None:
-     converter = DoclingConverter()
-     data = component_to_dict(converter, "docling_converter")
-     restored = component_from_dict(DoclingConverter, data, "docling_converter")
+     # null kwargs mirror the pre-refactor serialization format and must still deserialize correctly
+     data = {
+         "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
+         "init_parameters": {
+             "converter": None,
+             "convert_kwargs": None,
+             "export_type": "doc_chunks",
+             "md_export_kwargs": None,
+             "chunker": None,
+             "meta_extractor": None,
+         },
+     }
+     restored = DoclingConverter.from_dict(data)

      assert restored.converter is None
      assert restored.convert_kwargs == {}
@@ -209,17 +208,44 @@ def test_component_from_dict_defaults() -> None:


  def test_component_from_dict_custom_params() -> None:
-     converter = DoclingConverter(
-         convert_kwargs={"raises_on_error": False},
-         export_type=ExportType.JSON,
-         md_export_kwargs={"image_placeholder": "[img]"},
-     )
-     data = component_to_dict(converter, "docling_converter")
-     restored = component_from_dict(DoclingConverter, data, "docling_converter")
+     data = {
+         "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
+         "init_parameters": {
+             "converter": None,
+             "convert_kwargs": {"raises_on_error": False},
+             "export_type": "json",
+             "md_export_kwargs": {"image_placeholder": "[img]"},
+             "chunker": None,
+             "meta_extractor": {
+                 "type": "haystack_integrations.components.converters.docling.converter.MetaExtractor",
+                 "data": {},
+             },
+         },
+     }
+     restored = DoclingConverter.from_dict(data)

+     assert restored.converter is None
      assert restored.convert_kwargs == {"raises_on_error": False}
      assert restored.export_type == ExportType.JSON
      assert restored.md_export_kwargs == {"image_placeholder": "[img]"}
+     assert restored.chunker is None
+     assert isinstance(restored.meta_extractor, MetaExtractor)
+
+
+ def test_component_to_dict_chunker_warns_and_is_dropped() -> None:
+     converter = DoclingConverter(export_type=ExportType.DOC_CHUNKS, chunker=HybridChunker(merge_peers=False))
+
+     assert converter.to_dict() == {
+         "type": "haystack_integrations.components.converters.docling.converter.DoclingConverter",
+         "init_parameters": {
+             "converter": None,
+             "convert_kwargs": {},
+             "export_type": "doc_chunks",
+             "md_export_kwargs": {"image_placeholder": ""},
+             "chunker": None,
+             "meta_extractor": None,
+         },
+     }


  def test_run_with_sources_parameter() -> None:
@@ -441,3 +467,152 @@ class TestBytestreamToDocumentStream:
          ds = _bytestream_to_document_stream(bs)
          assert isinstance(ds, DocumentStream)
          assert isinstance(ds.stream, BytesIO)
+
+     def test_unknown_mime_type_keeps_base_name(self) -> None:
+         # mimetypes.guess_extension returns None for unknown types, so the name stays as-is.
+         assert mimetypes.guess_extension("application/x-totally-made-up-type") is None
+         bs = ByteStream(
+             data=b"data",
+             meta={"file_path": "report"},
+             mime_type="application/x-totally-made-up-type",
+         )
+         ds = _bytestream_to_document_stream(bs)
+         assert ds.name == "report"
+
+
+ class TestMetaExtractor:
+     def test_extract_chunk_meta_wraps_export_json_dict(self) -> None:
+         chunk = MagicMock()
+         chunk.export_json_dict.return_value = {"some": "dict"}
+         chunk.meta.doc_items = []
+
+         result = MetaExtractor().extract_chunk_meta(chunk=chunk)
+
+         assert result == {"dl_meta": {"some": "dict"}}
+         chunk.export_json_dict.assert_called_once_with()
+
+     def test_extract_chunk_meta_includes_page_number(self) -> None:
+         prov = MagicMock()
+         prov.page_no = 3
+         doc_item = MagicMock()
+         doc_item.prov = [prov]
+
+         chunk = MagicMock()
+         chunk.export_json_dict.return_value = {"some": "dict"}
+         chunk.meta.doc_items = [doc_item]
+
+         result = MetaExtractor().extract_chunk_meta(chunk=chunk)
+
+         assert result == {"dl_meta": {"some": "dict"}, "page_number": 3}
+
+     def test_extract_chunk_meta_page_number_uses_minimum(self) -> None:
+         prov1 = MagicMock()
+         prov1.page_no = 5
+         prov2 = MagicMock()
+         prov2.page_no = 3
+         doc_item = MagicMock()
+         doc_item.prov = [prov1, prov2]
+
+         chunk = MagicMock()
+         chunk.export_json_dict.return_value = {}
+         chunk.meta.doc_items = [doc_item]
+
+         result = MetaExtractor().extract_chunk_meta(chunk=chunk)
+
+         assert result["page_number"] == 3
+
+     def test_extract_dl_doc_meta_with_origin(self) -> None:
+         dl_doc = MagicMock()
+         dl_doc.origin.model_dump.return_value = {"filename": "foo.pdf", "mimetype": "application/pdf"}
+
+         result = MetaExtractor().extract_dl_doc_meta(dl_doc=dl_doc)
+
+         assert result == {"dl_meta": {"origin": {"filename": "foo.pdf", "mimetype": "application/pdf"}}}
+         dl_doc.origin.model_dump.assert_called_once_with(exclude_none=True)
+
+     def test_extract_dl_doc_meta_without_origin(self) -> None:
+         dl_doc = MagicMock()
+         dl_doc.origin = None
+
+         result = MetaExtractor().extract_dl_doc_meta(dl_doc=dl_doc)
+
+         assert result == {}
+
+
+ def test_run_without_sources_or_paths_raises_value_error() -> None:
+     converter = DoclingConverter(converter=MagicMock(), meta_extractor=MagicMock())
+     with pytest.raises(ValueError, match=r"Either 'sources' or the deprecated 'paths' parameter must be provided."):
+         converter.run()
+
+
+ def test_run_doc_chunks_split_id_and_split_idx_start() -> None:
+     converter_mock = MagicMock()
+     chunker_mock = MagicMock()
+     meta_extractor_mock = MagicMock()
+
+     converter_mock.convert.return_value = SimpleNamespace(document="dl-doc")
+
+     chunks = [
+         SimpleNamespace(text="hello world"),
+         SimpleNamespace(text="foo bar baz"),
+     ]
+     chunker_mock.chunk.return_value = chunks
+     chunker_mock.contextualize.side_effect = lambda chunk: f"ctx:{chunk.text}"
+     meta_extractor_mock.extract_chunk_meta.return_value = {}
+
+     converter = DoclingConverter(
+         converter=converter_mock,
+         export_type=ExportType.DOC_CHUNKS,
+         chunker=chunker_mock,
+         meta_extractor=meta_extractor_mock,
+     )
+
+     result = converter.run(sources=["doc.pdf"])
+     documents = result["documents"]
+
+     assert len(documents) == 2
+     assert documents[0].meta["split_id"] == 0
+     assert documents[0].meta["split_idx_start"] == 0
+     assert documents[1].meta["split_id"] == 1
+     assert documents[1].meta["split_idx_start"] == len("hello world")
+
+
+ def test_run_doc_chunks_split_id_resets_per_document() -> None:
+     converter_mock = MagicMock()
+     chunker_mock = MagicMock()
+     meta_extractor_mock = MagicMock()
+
+     converter_mock.convert.side_effect = [
+         SimpleNamespace(document="dl-doc-a"),
+         SimpleNamespace(document="dl-doc-b"),
+     ]
+     chunker_mock.chunk.side_effect = lambda dl_doc: [
+         SimpleNamespace(text=f"chunk-1-of-{dl_doc}"),
+         SimpleNamespace(text=f"chunk-2-of-{dl_doc}"),
+     ]
+     chunker_mock.contextualize.side_effect = lambda chunk: chunk.text
+     meta_extractor_mock.extract_chunk_meta.return_value = {}
+
+     converter = DoclingConverter(
+         converter=converter_mock,
+         export_type=ExportType.DOC_CHUNKS,
+         chunker=chunker_mock,
+         meta_extractor=meta_extractor_mock,
+     )
+
+     result = converter.run(sources=["a.pdf", "b.pdf"])
+     documents = result["documents"]
+
+     # split_id and split_idx_start reset for each source document
+     doc_a_chunks = documents[:2]
+     doc_b_chunks = documents[2:]
+
+     assert doc_a_chunks[0].meta["split_id"] == 0
+     assert doc_a_chunks[0].meta["split_idx_start"] == 0
+     assert doc_a_chunks[1].meta["split_id"] == 1
+     assert doc_a_chunks[1].meta["split_idx_start"] == len("chunk-1-of-dl-doc-a")
+
+     assert doc_b_chunks[0].meta["split_id"] == 0
+     assert doc_b_chunks[0].meta["split_idx_start"] == 0
+     assert doc_b_chunks[1].meta["split_id"] == 1
+     assert doc_b_chunks[1].meta["split_idx_start"] == len("chunk-1-of-dl-doc-b")
@@ -1,13 +0,0 @@
- # Changelog
-
- ## [integrations/docling-v0.2.0] - 2026-04-08
-
- ### 🚀 Features
-
- - Add Docling document converter (#3066)
-
- ### 🚜 Refactor
-
- - *(docling)* Add meta parameter to run(); introduce sources; deprecate paths (#3103)
-
- <!-- generated by git-cliff -->