docling-haystack 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 International Business Machines
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,54 @@
1
+ Metadata-Version: 2.1
2
+ Name: docling-haystack
3
+ Version: 0.1.1
4
+ Summary: Docling Haystack converter
5
+ Author: Panos Vagenas
6
+ Author-email: pva@zurich.ibm.com
7
+ Requires-Python: >=3.9,<3.13
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.9
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Requires-Dist: docling (>=2.9.0,<3.0.0)
14
+ Requires-Dist: haystack-ai (>=2.8.0,<3.0.0)
15
+ Description-Content-Type: text/markdown
16
+
17
+ # Docling Haystack converter
18
+
19
+ [![PyPI version](https://img.shields.io/pypi/v/docling-haystack)](https://pypi.org/project/docling-haystack/)
20
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling-haystack)](https://pypi.org/project/docling-haystack/)
21
+ [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
22
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
23
+ [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
24
+ [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
25
+ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
26
+ [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
27
+
28
+ A [Docling](https://github.com/DS4SD/docling) converter integration for
29
+ [Haystack](https://github.com/deepset-ai/haystack/).
30
+
31
+ ## Installation
32
+
33
+ Simply install `docling-haystack` from your package manager, e.g. pip:
34
+ ```bash
35
+ pip install docling-haystack
36
+ ```
37
+
38
+ ## Usage
39
+
40
+ Basic usage in a Haystack pipeline looks as follows:
41
+
42
+ ```python
43
+ from haystack import Pipeline
44
+ from docling_haystack.converter import DoclingConverter
45
+
46
+ idx_pipe = Pipeline()
47
+ # ...
48
+ converter = DoclingConverter()
49
+ idx_pipe.add_component("converter", converter)
50
+ # ...
51
+ ```
52
+
53
+ For end-to-end usage samples, check out the [examples](examples/).
54
+
@@ -0,0 +1,37 @@
1
+ # Docling Haystack converter
2
+
3
+ [![PyPI version](https://img.shields.io/pypi/v/docling-haystack)](https://pypi.org/project/docling-haystack/)
4
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling-haystack)](https://pypi.org/project/docling-haystack/)
5
+ [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
6
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
7
+ [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
8
+ [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
9
+ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
10
+ [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
11
+
12
+ A [Docling](https://github.com/DS4SD/docling) converter integration for
13
+ [Haystack](https://github.com/deepset-ai/haystack/).
14
+
15
+ ## Installation
16
+
17
+ Simply install `docling-haystack` from your package manager, e.g. pip:
18
+ ```bash
19
+ pip install docling-haystack
20
+ ```
21
+
22
+ ## Usage
23
+
24
+ Basic usage in a Haystack pipeline looks as follows:
25
+
26
+ ```python
27
+ from haystack import Pipeline
28
+ from docling_haystack.converter import DoclingConverter
29
+
30
+ idx_pipe = Pipeline()
31
+ # ...
32
+ converter = DoclingConverter()
33
+ idx_pipe.add_component("converter", converter)
34
+ # ...
35
+ ```
36
+
37
+ For end-to-end usage samples, check out the [examples](examples/).
@@ -0,0 +1,7 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+ """Docling Haystack package."""
6
+
7
+ from docling_haystack.converter import DoclingConverter
@@ -0,0 +1,141 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Docling Haystack converter module."""
7
+
8
+ from abc import ABC, abstractmethod
9
+ from enum import Enum
10
+ from pathlib import Path
11
+ from typing import Any, Iterable, Optional, Union
12
+
13
+ from docling.chunking import BaseChunk, BaseChunker, HybridChunker
14
+ from docling.datamodel.document import DoclingDocument
15
+ from docling.document_converter import DocumentConverter
16
+ from haystack import Document, component
17
+
18
+
19
+ class ExportType(str, Enum):
20
+ """Enumeration of available export types."""
21
+
22
+ MARKDOWN = "markdown"
23
+ DOC_CHUNKS = "doc_chunks"
24
+
25
+
26
class BaseMetaExtractor(ABC):
    """Abstract interface for producing output document metadata.

    Concrete extractors must implement both methods below; each one returns a
    metadata dictionary for the object it is given.
    """

    @abstractmethod
    def extract_chunk_meta(self, chunk: BaseChunk) -> dict[str, Any]:
        """Build the metadata for a single chunk.

        Args:
            chunk: The chunk to derive metadata from.

        Returns:
            dict[str, Any]: The metadata for the given chunk.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError()

    @abstractmethod
    def extract_dl_doc_meta(self, dl_doc: DoclingDocument) -> dict[str, Any]:
        """Build the metadata for a whole Docling document.

        Args:
            dl_doc: The Docling document to derive metadata from.

        Returns:
            dict[str, Any]: The metadata for the given document.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError()
38
+
39
+
40
class MetaExtractor(BaseMetaExtractor):
    """Metadata extractor that stores its output under the `dl_meta` key."""

    def extract_chunk_meta(self, chunk: BaseChunk) -> dict[str, Any]:
        """Build the metadata for a single chunk.

        Args:
            chunk: The chunk to derive metadata from.

        Returns:
            dict[str, Any]: The chunk's JSON dict export, keyed by `dl_meta`.
        """
        chunk_dict = chunk.export_json_dict()
        return {"dl_meta": chunk_dict}

    def extract_dl_doc_meta(self, dl_doc: DoclingDocument) -> dict[str, Any]:
        """Build the metadata for a whole Docling document.

        Args:
            dl_doc: The Docling document to derive metadata from.

        Returns:
            dict[str, Any]: The dumped document origin (`None` fields excluded)
                nested under `dl_meta`, or an empty dict if the document has no
                origin.
        """
        origin = dl_doc.origin
        if not origin:
            return {}
        origin_dict = origin.model_dump(exclude_none=True)
        return {"dl_meta": {"origin": origin_dict}}
54
+
55
+
56
@component
class DoclingConverter:
    """Docling Haystack converter.

    Converts the input documents with Docling and returns them as Haystack
    documents: either one document per chunk or one Markdown document per input,
    depending on the configured export type.
    """

    def __init__(
        self,
        converter: Optional[DocumentConverter] = None,
        convert_kwargs: Optional[dict[str, Any]] = None,
        export_type: ExportType = ExportType.DOC_CHUNKS,
        md_export_kwargs: Optional[dict[str, Any]] = None,
        chunker: Optional[BaseChunker] = None,
        meta_extractor: Optional[BaseMetaExtractor] = None,
    ):
        """Create a Docling Haystack converter.

        Args:
            converter: The Docling `DocumentConverter` to use; if not set, a system
                default is used.
            convert_kwargs: Any parameters to pass to Docling conversion; if not set,
                a system default is used.
            export_type: The export mode to use: set to `ExportType.MARKDOWN` if you
                want to capture each input document as a separate Haystack document,
                or `ExportType.DOC_CHUNKS` (default), if you want to first have each
                input document chunked and to then capture each individual chunk as
                a separate Haystack document downstream.
            md_export_kwargs: Any parameters to pass to Markdown export (applicable
                in case of `ExportType.MARKDOWN`).
            chunker: The Docling chunker instance to use; if not set, a system
                default is used.
            meta_extractor: The extractor instance to use for populating the output
                document metadata; if not set, a system default is used.
        """
        self._converter = converter or DocumentConverter()
        self._convert_kwargs = convert_kwargs if convert_kwargs is not None else {}
        self._export_type = export_type
        self._md_export_kwargs = (
            md_export_kwargs
            if md_export_kwargs is not None
            else {"image_placeholder": ""}
        )
        # The chunker is only needed (and only created) in chunking mode.
        if self._export_type == ExportType.DOC_CHUNKS:
            # TODO remove tokenizer once docling-core ^2.10.0 guaranteed via docling:
            self._chunker = chunker or HybridChunker(
                tokenizer="sentence-transformers/all-MiniLM-L6-v2"
            )
        self._meta_extractor = meta_extractor or MetaExtractor()

    @component.output_types(documents=list[Document])
    def run(
        self,
        paths: Iterable[Union[Path, str]],
    ):
        """Run the DoclingConverter.

        Args:
            paths: The input document locations, either as local paths or URLs. A
                single `str` or `Path` is treated as a one-element collection.

        Returns:
            dict: A dictionary whose `documents` entry holds the list of output
                Haystack Documents.

        Raises:
            RuntimeError: If the configured export type is not supported.
        """
        # A lone str would otherwise be iterated character by character, and a
        # lone Path is not iterable at all, so wrap a single location in a list.
        if isinstance(paths, (str, Path)):
            paths = [paths]

        documents: list[Document] = []
        for filepath in paths:
            dl_doc = self._converter.convert(
                source=filepath,
                **self._convert_kwargs,
            ).document

            if self._export_type == ExportType.DOC_CHUNKS:
                # One Haystack document per chunk of the converted document.
                chunk_iter = self._chunker.chunk(dl_doc=dl_doc)
                hs_docs = [
                    Document(
                        content=self._chunker.serialize(chunk=chunk),
                        meta=self._meta_extractor.extract_chunk_meta(chunk=chunk),
                    )
                    for chunk in chunk_iter
                ]
                documents.extend(hs_docs)
            elif self._export_type == ExportType.MARKDOWN:
                # One Haystack document per converted document.
                hs_doc = Document(
                    content=dl_doc.export_to_markdown(**self._md_export_kwargs),
                    meta=self._meta_extractor.extract_dl_doc_meta(dl_doc=dl_doc),
                )
                documents.append(hs_doc)
            else:
                raise RuntimeError(f"Unexpected export type: {self._export_type}")
        return {"documents": documents}
File without changes
@@ -0,0 +1,80 @@
1
[tool.poetry]
name = "docling-haystack"
version = "0.1.1"  # DO NOT EDIT, updated automatically
description = "Docling Haystack converter"
# Matches the shipped LICENSE file and the SPDX headers in the sources.
license = "MIT"
authors = ["Panos Vagenas <pva@zurich.ibm.com>"]
readme = "README.md"

packages = [{ include = "docling_haystack" }]

[tool.poetry.dependencies]
python = ">=3.9,<3.13"  # constraining below 3.13 due to haystack-ai 2.8.0
haystack-ai = "^2.8.0"
docling = "^2.9.0"

[tool.poetry.group.dev.dependencies]
ipykernel = "^6.29.5"
black = "^24.4.2"
pytest = "^7.1.2"
mypy = "^1.6.0"
isort = "^5.10.1"
pre-commit = "^3.7.1"
autoflake = "^2.0.0"
flake8 = "^7.1.0"
pycodestyle = "^2.10.0"
flake8-docstrings = "^1.6.0"
pep8-naming = "^0.13.2"
nbqa = "^1.9.0"
python-semantic-release = "^7.32.2"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.black]
line-length = 88
# Keep in sync with the `python` constraint in [tool.poetry.dependencies].
target-version = ["py39", "py310", "py311", "py312"]
include = '\.pyi?$'
preview = true

[tool.isort]
profile = "black"
line_length = 88
py_version = 39
multi_line_output = 3
include_trailing_comma = true

[tool.autoflake]
in-place = true
ignore-init-module-imports = true
remove-all-unused-imports = true
remove-unused-variables = true
expand-star-imports = true
recursive = true

[tool.mypy]
pretty = true
no_implicit_optional = true
namespace_packages = true
show_error_codes = true
python_version = "3.9"
plugins = ["pydantic.mypy"]

[[tool.mypy.overrides]]
module = [
    "haystack.*",
]
ignore_missing_imports = true

[tool.semantic_release]
# for default values check:
# https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg

version_source = "tag_only"
branch = "main"

# configure types which should trigger minor and patch version bumps respectively
# (note that they must be a subset of the configured allowed types):
parser_angular_allowed_types = "build,chore,ci,docs,feat,fix,perf,style,refactor,test"
parser_angular_minor_types = "feat"
parser_angular_patch_types = "fix,perf"