dewey-haystack 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dewey_haystack-0.1.0.dist-info/METADATA +199 -0
- dewey_haystack-0.1.0.dist-info/RECORD +9 -0
- dewey_haystack-0.1.0.dist-info/WHEEL +5 -0
- dewey_haystack-0.1.0.dist-info/top_level.txt +1 -0
- haystack_integrations/components/retrievers/dewey/__init__.py +8 -0
- haystack_integrations/components/retrievers/dewey/dewey_research_component.py +162 -0
- haystack_integrations/components/retrievers/dewey/dewey_retriever.py +116 -0
- haystack_integrations/document_stores/dewey/__init__.py +5 -0
- haystack_integrations/document_stores/dewey/dewey_document_store.py +210 -0
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dewey-haystack
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Haystack integration for Dewey — document store, retriever, and research component
|
|
5
|
+
Author-email: Dewey <hi@meetdewey.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://meetdewey.com
|
|
8
|
+
Project-URL: Repository, https://github.com/meetdewey/dewey-haystack
|
|
9
|
+
Keywords: haystack,dewey,rag,retrieval,document-store,llm
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Requires-Python: >=3.9
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
Requires-Dist: meetdewey>=1.0
|
|
21
|
+
Requires-Dist: haystack-ai>=2.0
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
24
|
+
Requires-Dist: pytest-mock>=3; extra == "dev"
|
|
25
|
+
|
|
26
|
+
# dewey-haystack
|
|
27
|
+
|
|
28
|
+
[](https://github.com/meetdewey/dewey-haystack/actions/workflows/ci.yml)
|
|
29
|
+
|
|
30
|
+
[Haystack](https://haystack.deepset.ai/) integration for [Dewey](https://meetdewey.com) — document store, retriever, and research component.
|
|
31
|
+
|
|
32
|
+
## Installation
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install dewey-haystack
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Components
|
|
39
|
+
|
|
40
|
+
### DeweyDocumentStore
|
|
41
|
+
|
|
42
|
+
Haystack DocumentStore backed by a Dewey collection. Handles document upload and deletion; Dewey manages chunking and embeddings automatically.
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from haystack_integrations.document_stores.dewey import DeweyDocumentStore
|
|
46
|
+
from haystack.utils import Secret
|
|
47
|
+
|
|
48
|
+
store = DeweyDocumentStore(
|
|
49
|
+
api_key=Secret.from_env_var("DEWEY_API_KEY"),
|
|
50
|
+
collection_id="3f7a1b2c-...",
|
|
51
|
+
)
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Upload Haystack Documents:
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from haystack import Document
|
|
58
|
+
|
|
59
|
+
store.write_documents([
|
|
60
|
+
Document(content="Neural networks learn via backpropagation.", meta={"source": "ml.txt"}),
|
|
61
|
+
Document(content="Transformers use self-attention mechanisms."),
|
|
62
|
+
])
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### DeweyRetriever
|
|
66
|
+
|
|
67
|
+
Drop-in Haystack retriever backed by Dewey's hybrid semantic + BM25 search.
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
from haystack import Pipeline
|
|
71
|
+
from haystack_integrations.document_stores.dewey import DeweyDocumentStore
|
|
72
|
+
from haystack_integrations.components.retrievers.dewey import DeweyRetriever
|
|
73
|
+
from haystack.utils import Secret
|
|
74
|
+
|
|
75
|
+
store = DeweyDocumentStore(
|
|
76
|
+
api_key=Secret.from_env_var("DEWEY_API_KEY"),
|
|
77
|
+
collection_id="3f7a1b2c-...",
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
pipeline = Pipeline()
|
|
81
|
+
pipeline.add_component("retriever", DeweyRetriever(document_store=store, top_k=8))
|
|
82
|
+
|
|
83
|
+
result = pipeline.run({"retriever": {"query": "What are the key findings?"}})
|
|
84
|
+
for doc in result["retriever"]["documents"]:
|
|
85
|
+
print(f"[{doc.meta['filename']}] {doc.content}")
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Each returned `Document` carries citation metadata:
|
|
89
|
+
|
|
90
|
+
| Field | Description |
|
|
91
|
+
|---|---|
|
|
92
|
+
| `score` | Relevance score (0–1) |
|
|
93
|
+
| `document_id` | Dewey document ID |
|
|
94
|
+
| `filename` | Original filename |
|
|
95
|
+
| `section_id` | Section ID |
|
|
96
|
+
| `section_title` | Section heading |
|
|
97
|
+
| `section_level` | Heading depth (1 = top-level) |
|
|
98
|
+
|
|
99
|
+
**RAG pipeline with an LLM:**
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from haystack.components.builders import PromptBuilder
|
|
103
|
+
from haystack.components.generators import OpenAIGenerator
|
|
104
|
+
|
|
105
|
+
prompt_template = """
|
|
106
|
+
Answer the question using only the provided context.
|
|
107
|
+
|
|
108
|
+
Context:
|
|
109
|
+
{% for doc in documents %}
|
|
110
|
+
- {{ doc.content }}
|
|
111
|
+
{% endfor %}
|
|
112
|
+
|
|
113
|
+
Question: {{ query }}
|
|
114
|
+
"""
|
|
115
|
+
|
|
116
|
+
pipeline = Pipeline()
|
|
117
|
+
pipeline.add_component("retriever", DeweyRetriever(document_store=store, top_k=5))
|
|
118
|
+
pipeline.add_component("prompt", PromptBuilder(template=prompt_template))
|
|
119
|
+
pipeline.add_component("llm", OpenAIGenerator(model="gpt-4o-mini"))
|
|
120
|
+
|
|
121
|
+
pipeline.connect("retriever.documents", "prompt.documents")
|
|
122
|
+
pipeline.connect("prompt.prompt", "llm.prompt")
|
|
123
|
+
|
|
124
|
+
result = pipeline.run({
|
|
125
|
+
"retriever": {"query": "What were the main findings?"},
|
|
126
|
+
"prompt": {"query": "What were the main findings?"},
|
|
127
|
+
})
|
|
128
|
+
print(result["llm"]["replies"][0])
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### DeweyResearchComponent
|
|
132
|
+
|
|
133
|
+
A Haystack component that runs Dewey's full agentic research loop — searching, reading, and synthesising across multiple documents — and returns a grounded Markdown answer with cited sources.
|
|
134
|
+
|
|
135
|
+
Use this as a drop-in replacement for an LLM generator when you want Dewey to handle both retrieval *and* generation.
|
|
136
|
+
|
|
137
|
+
```python
|
|
138
|
+
from haystack import Pipeline
|
|
139
|
+
from haystack_integrations.components.retrievers.dewey import DeweyResearchComponent
|
|
140
|
+
from haystack.utils import Secret
|
|
141
|
+
|
|
142
|
+
pipeline = Pipeline()
|
|
143
|
+
pipeline.add_component(
|
|
144
|
+
"research",
|
|
145
|
+
DeweyResearchComponent(
|
|
146
|
+
api_key=Secret.from_env_var("DEWEY_API_KEY"),
|
|
147
|
+
collection_id="3f7a1b2c-...",
|
|
148
|
+
depth="balanced",
|
|
149
|
+
),
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
result = pipeline.run({"research": {"query": "What were the key findings across all studies?"}})
|
|
153
|
+
print(result["research"]["answer"])
|
|
154
|
+
|
|
155
|
+
for source in result["research"]["sources"]:
|
|
156
|
+
print(f" [{source.meta['filename']}] {source.content[:80]}...")
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
**Outputs:**
|
|
160
|
+
|
|
161
|
+
| Key | Type | Description |
|
|
162
|
+
|---|---|---|
|
|
163
|
+
| `answer` | `str` | Synthesised Markdown answer |
|
|
164
|
+
| `sources` | `list[Document]` | Source chunks cited by the answer |
|
|
165
|
+
|
|
166
|
+
**Research depths:**
|
|
167
|
+
|
|
168
|
+
| depth | Speed | Tools | Requires BYOK |
|
|
169
|
+
|---|---|---|---|
|
|
170
|
+
| `quick` | fast | basic search | no |
|
|
171
|
+
| `balanced` | fast | basic search | no |
|
|
172
|
+
| `deep` | slower | full tool suite | yes |
|
|
173
|
+
| `exhaustive` | slowest | full tool suite | yes |
|
|
174
|
+
|
|
175
|
+
`deep` and `exhaustive` require a Dewey Pro plan and a BYOK API key configured on your project.
|
|
176
|
+
|
|
177
|
+
**With a custom model:**
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
DeweyResearchComponent(
|
|
181
|
+
api_key=Secret.from_env_var("DEWEY_API_KEY"),
|
|
182
|
+
collection_id="3f7a1b2c-...",
|
|
183
|
+
depth="deep",
|
|
184
|
+
model="claude-sonnet-4-6", # requires Anthropic BYOK key on your project
|
|
185
|
+
)
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
## Requirements
|
|
189
|
+
|
|
190
|
+
- Python 3.9+
|
|
191
|
+
- `meetdewey >= 1.0`
|
|
192
|
+
- `haystack-ai >= 2.0`
|
|
193
|
+
|
|
194
|
+
## Development
|
|
195
|
+
|
|
196
|
+
```bash
|
|
197
|
+
pip install -e ".[dev]"
|
|
198
|
+
pytest
|
|
199
|
+
```
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
haystack_integrations/components/retrievers/dewey/__init__.py,sha256=NmjzgaPoebXfq31dse_dH-7VK-jTOJQxnpfs81Fw_T8,277
|
|
2
|
+
haystack_integrations/components/retrievers/dewey/dewey_research_component.py,sha256=WIQ6odb_835QdvWJR6eDulKujoT8_jRLsrrujUpk5kE,6190
|
|
3
|
+
haystack_integrations/components/retrievers/dewey/dewey_retriever.py,sha256=lFPXpNauvT0ot-q5teBS29o5xksMJBQor9HMuU7hNDk,4126
|
|
4
|
+
haystack_integrations/document_stores/dewey/__init__.py,sha256=8i96VIg-8vh1oWZENMMBuZtMJQo4ho-J31k7tJSxU7o,139
|
|
5
|
+
haystack_integrations/document_stores/dewey/dewey_document_store.py,sha256=GGpJ0CTvhQW2Ir522Sfwi4tOVMd1tpYFupyBK0LD9dQ,8391
|
|
6
|
+
dewey_haystack-0.1.0.dist-info/METADATA,sha256=PWDpEDxyt1jyMLjD1GFRLIXn7nBmGb_izxzY0c5zd7I,5901
|
|
7
|
+
dewey_haystack-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
8
|
+
dewey_haystack-0.1.0.dist-info/top_level.txt,sha256=xEV0YMHti-yzzXgkLgSzsILiqsuC91-4ShNpDgsKNRI,22
|
|
9
|
+
dewey_haystack-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
haystack_integrations
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# Re-export the two retriever-side components so users can import them
# from the package root rather than the individual modules.
from haystack_integrations.components.retrievers.dewey.dewey_research_component import (
    DeweyResearchComponent,
)
from haystack_integrations.components.retrievers.dewey.dewey_retriever import (
    DeweyRetriever,
)

# Explicit public API: what `from ... import *` exposes.
__all__ = ["DeweyRetriever", "DeweyResearchComponent"]
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
"""DeweyResearchComponent — Haystack component that runs a full Dewey research query."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Dict, List, Optional
|
|
6
|
+
|
|
7
|
+
from haystack import Document, component, default_from_dict, default_to_dict
|
|
8
|
+
from haystack.utils import Secret, deserialize_secrets_inplace
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@component
class DeweyResearchComponent:
    """Run Dewey's agentic research loop over a collection.

    Where :class:`DeweyRetriever` performs a single-shot search, this
    component drives Dewey's multi-step research pipeline — it searches,
    reads, and synthesises across documents, then returns a grounded
    Markdown answer together with the source chunks it cited. It can
    stand in for an LLM generator in a Haystack pipeline when Dewey
    should own both retrieval *and* generation.

    Example::

        from haystack import Pipeline
        from haystack_integrations.components.retrievers.dewey import DeweyResearchComponent
        from haystack.utils import Secret

        pipeline = Pipeline()
        pipeline.add_component(
            "research",
            DeweyResearchComponent(
                api_key=Secret.from_env_var("DEWEY_API_KEY"),
                collection_id="3f7a1b2c-...",
                depth="balanced",
            ),
        )

        result = pipeline.run({"research": {"query": "What were the key findings?"}})
        print(result["research"]["answer"])
        for source in result["research"]["sources"]:
            print(f"  [{source.meta['filename']}] {source.content[:80]}...")

    Supported depths are ``quick``, ``balanced``, ``deep``, and
    ``exhaustive``. The ``deep`` and ``exhaustive`` depths require a Pro
    plan and a BYOK API key configured on your Dewey project.
    """

    def __init__(
        self,
        api_key: Secret = Secret.from_env_var("DEWEY_API_KEY"),  # noqa: B008
        collection_id: str = "",
        depth: str = "balanced",
        model: Optional[str] = None,
        base_url: str = "https://api.meetdewey.com/v1",
    ) -> None:
        """
        Args:
            api_key: Dewey project API key. Defaults to the
                ``DEWEY_API_KEY`` environment variable.
            collection_id: ID of the Dewey collection to research.
            depth: Research depth — ``"quick"``, ``"balanced"`` (default),
                ``"deep"``, or ``"exhaustive"``.
            model: Optional model override (e.g. ``"gpt-4o"``,
                ``"claude-sonnet-4-6"``). When ``None``, Dewey's
                server-side default for the chosen depth is used.
            base_url: Dewey API base URL. Override for local development.

        Raises:
            ValueError: If ``depth`` is not one of the four known depths.
        """
        allowed_depths = ("quick", "balanced", "deep", "exhaustive")
        if depth not in allowed_depths:
            raise ValueError(
                f"Invalid depth {depth!r}. Must be one of: quick, balanced, deep, exhaustive"
            )
        self.api_key = api_key
        self.collection_id = collection_id
        self.depth = depth
        self.model = model
        self.base_url = base_url
        # Lazily built Dewey SDK client; see _get_client.
        self._client: Any = None

    def _get_client(self) -> Any:
        # Import lazily so the module can be imported (and the component
        # serialized/deserialized) without the `dewey` SDK present.
        if self._client is None:
            from dewey import DeweyClient

            self._client = DeweyClient(
                api_key=self.api_key.resolve_value(), base_url=self.base_url
            )
        return self._client

    @component.output_types(answer=str, sources=List[Document])
    def run(self, query: str) -> Dict[str, Any]:
        """Execute a research query against the collection.

        Consumes Dewey's research SSE stream internally and returns only
        once the stream has closed.

        Args:
            query: The research question.

        Returns:
            A dict with:
                - ``"answer"`` (``str``): The synthesised Markdown answer.
                - ``"sources"`` (``list[Document]``): Source chunks cited,
                  each with ``content`` and metadata (``filename``,
                  ``document_id``, ``section_id``, ``section_title``,
                  ``section_level``).

        Raises:
            RuntimeError: If the stream reports an ``error`` event.
        """
        research_kwargs: Dict[str, Any] = {"depth": self.depth}
        if self.model:
            research_kwargs["model"] = self.model

        pieces: List[str] = []
        cited: List[Document] = []

        stream = self._get_client().research.stream(
            self.collection_id, query, **research_kwargs
        )
        for event in stream:
            if event.type == "chunk":
                # Incremental answer text; joined at the end.
                pieces.append(event.content)
            elif event.type == "done":
                raw_sources = getattr(event, "sources", None) or []
                cited = [
                    Document(
                        content=src.content,
                        meta={
                            "document_id": src.documentId,
                            "filename": src.filename,
                            "section_id": src.sectionId,
                            "section_title": src.sectionTitle,
                            "section_level": src.sectionLevel,
                        },
                    )
                    for src in raw_sources
                ]
            elif event.type == "error":
                raise RuntimeError(f"Dewey research error: {event.message}")

        return {"answer": "".join(pieces), "sources": cited}

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the component for Haystack pipeline YAML/dict dumps."""
        return default_to_dict(
            self,
            api_key=self.api_key.to_dict(),
            collection_id=self.collection_id,
            depth=self.depth,
            model=self.model,
            base_url=self.base_url,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DeweyResearchComponent":
        """Rebuild the component from :meth:`to_dict` output."""
        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
        return default_from_dict(cls, data)
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""DeweyRetriever — Haystack component backed by Dewey hybrid search."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Dict, List, Optional
|
|
6
|
+
|
|
7
|
+
from haystack import Document, component, default_from_dict, default_to_dict
|
|
8
|
+
|
|
9
|
+
from haystack_integrations.document_stores.dewey import DeweyDocumentStore
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@component
class DeweyRetriever:
    """Haystack retriever that queries a Dewey collection.

    Uses Dewey's hybrid semantic + BM25 search and returns chunks as
    Haystack ``Document`` objects with full citation metadata (``score``,
    ``document_id``, ``filename``, ``section_id``, ``section_title``,
    ``section_level``).

    Must be paired with a
    :class:`~haystack_integrations.document_stores.dewey.DeweyDocumentStore`.

    Example — standalone retrieval pipeline::

        from haystack import Pipeline
        from haystack_integrations.document_stores.dewey import DeweyDocumentStore
        from haystack_integrations.components.retrievers.dewey import DeweyRetriever
        from haystack.utils import Secret

        store = DeweyDocumentStore(
            api_key=Secret.from_env_var("DEWEY_API_KEY"),
            collection_id="3f7a1b2c-...",
        )

        pipeline = Pipeline()
        pipeline.add_component("retriever", DeweyRetriever(document_store=store, top_k=8))

        result = pipeline.run({"retriever": {"query": "What are the key findings?"}})
        for doc in result["retriever"]["documents"]:
            print(doc.content, doc.meta["score"])

    For a full RAG pipeline, connect ``retriever.documents`` to a
    ``PromptBuilder`` and feed the rendered prompt to a generator.
    """

    def __init__(
        self,
        document_store: DeweyDocumentStore,
        top_k: int = 10,
        filters: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Args:
            document_store: A :class:`DeweyDocumentStore` instance.
            top_k: Maximum number of chunks to return (1–50).
            filters: Unused — present for interface compatibility. Dewey
                does not support metadata filtering at retrieval time.
        """
        self.document_store = document_store
        self.top_k = top_k
        self.filters = filters

    @component.output_types(documents=List[Document])
    def run(
        self,
        query: str,
        top_k: Optional[int] = None,
    ) -> Dict[str, List[Document]]:
        """Run the retriever.

        Args:
            query: The search query.
            top_k: Override the instance-level ``top_k`` for this call.

        Returns:
            A dict with a single key ``"documents"`` containing a list of
            :class:`~haystack.Document` objects ordered by relevance score.
        """
        k = top_k if top_k is not None else self.top_k
        documents = self.document_store._search(query, k)
        return {"documents": documents}

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the retriever for Haystack pipeline YAML/dict dumps."""
        return default_to_dict(
            self,
            document_store=self.document_store.to_dict(),
            top_k=self.top_k,
            # Fix: previously omitted, so a to_dict()/from_dict() round-trip
            # silently dropped any configured `filters` value.
            filters=self.filters,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DeweyRetriever":
        """Rebuild the retriever (and its document store) from :meth:`to_dict` output."""
        data["init_parameters"]["document_store"] = DeweyDocumentStore.from_dict(
            data["init_parameters"]["document_store"]
        )
        return default_from_dict(cls, data)
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""DeweyDocumentStore — Haystack DocumentStore backed by a Dewey collection."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
import time
|
|
7
|
+
from typing import Any, Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
from haystack import Document, default_from_dict, default_to_dict
|
|
10
|
+
from haystack.document_stores.types import DocumentStore, DuplicatePolicy
|
|
11
|
+
from haystack.utils import Secret, deserialize_secrets_inplace
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DeweyDocumentStore:
    """Haystack DocumentStore backed by a Dewey collection.

    Dewey manages its own chunking and embeddings. Each Haystack ``Document``
    written to the store is uploaded as a plain-text file and processed by
    Dewey's ingestion pipeline. Use :class:`DeweyRetriever` to query the
    store — ``filter_documents`` is intentionally limited (see below).

    Example::

        from haystack_integrations.document_stores.dewey import DeweyDocumentStore
        from haystack_integrations.components.retrievers.dewey import DeweyRetriever
        from haystack.utils import Secret

        store = DeweyDocumentStore(
            api_key=Secret.from_env_var("DEWEY_API_KEY"),
            collection_id="3f7a1b2c-...",
        )
        retriever = DeweyRetriever(document_store=store, top_k=8)
    """

    def __init__(
        self,
        api_key: Secret = Secret.from_env_var("DEWEY_API_KEY"),  # noqa: B008
        collection_id: str = "",
        base_url: str = "https://api.meetdewey.com/v1",
        poll_interval: float = 2.0,
        poll_timeout: float = 300.0,
    ) -> None:
        """
        Args:
            api_key: Dewey project API key. Defaults to the ``DEWEY_API_KEY``
                environment variable.
            collection_id: ID of the Dewey collection to use.
            base_url: Dewey API base URL. Override for local development.
            poll_interval: Seconds between status polls in ``write_documents``.
            poll_timeout: Maximum seconds to wait for documents to be ready.
        """
        self.api_key = api_key
        self.collection_id = collection_id
        self.base_url = base_url
        self.poll_interval = poll_interval
        self.poll_timeout = poll_timeout
        # Lazily built Dewey SDK client; see _get_client.
        self._client: Any = None

    def _get_client(self) -> Any:
        # Import lazily so this module can be imported (and the store
        # serialized/deserialized) without the `dewey` SDK installed.
        if self._client is None:
            from dewey import DeweyClient

            self._client = DeweyClient(
                api_key=self.api_key.resolve_value(), base_url=self.base_url
            )
        return self._client

    # ── DocumentStore protocol ────────────────────────────────────────────────

    def count_documents(self) -> int:
        """Return the number of documents (files) in the collection."""
        client = self._get_client()
        docs = client.documents.list(self.collection_id)
        return len(docs)

    def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
        """Return documents matching ``filters``.

        Dewey does not support arbitrary metadata filtering. Passing ``filters``
        will raise ``NotImplementedError``. Use :class:`DeweyRetriever` for
        semantic search instead.

        Calling with ``filters=None`` returns an empty list — use
        ``count_documents`` to check collection size, and ``DeweyRetriever``
        to retrieve content.
        """
        if filters is not None:
            raise NotImplementedError(
                "DeweyDocumentStore does not support metadata filters. "
                "Use DeweyRetriever for semantic search."
            )
        return []

    def write_documents(
        self,
        documents: List[Document],
        policy: DuplicatePolicy = DuplicatePolicy.NONE,
    ) -> int:
        """Upload documents to the Dewey collection and wait for them to be ready.

        Each Haystack ``Document`` is uploaded as a ``.txt`` file. The document's
        ``id`` is used as the filename when no ``"source"`` key is present in
        ``meta``. Dewey handles chunking and embedding automatically.

        Blocks until every uploaded document reaches ``ready`` status (see
        ``_wait_for_ready``), or raises on error/timeout.

        Args:
            documents: Haystack Documents to upload. Each must have non-empty
                ``content``.
            policy: Duplicate handling policy. Only ``NONE`` and ``OVERWRITE``
                are meaningful — Dewey deduplicates by content hash.
                NOTE(review): ``policy`` is not consulted in this body;
                deduplication presumably happens server-side — confirm.

        Returns:
            Number of documents successfully submitted for ingestion.
        """
        client = self._get_client()
        uploaded_ids: List[str] = []

        for doc in documents:
            # Documents with no content cannot be uploaded as files; skip
            # silently rather than fail the whole batch.
            if not doc.content:
                continue
            source = doc.meta.get("source") or doc.meta.get("filename") or f"{doc.id}.txt"
            # Ensure the final path component has an extension so Dewey treats
            # the upload as a text file.
            if "." not in source.split("/")[-1]:
                source = f"{source}.txt"

            uploaded = client.documents.upload(
                self.collection_id,
                io.BytesIO(doc.content.encode("utf-8")),
                filename=source,
                content_type="text/plain",
            )
            uploaded_ids.append(uploaded.id)

        self._wait_for_ready(uploaded_ids)
        return len(uploaded_ids)

    def delete_documents(self, document_ids: List[str]) -> None:
        """Delete documents from the collection by their Dewey document IDs."""
        client = self._get_client()
        for doc_id in document_ids:
            client.documents.delete(self.collection_id, doc_id)

    # ── Ingestion polling ─────────────────────────────────────────────────────

    def _wait_for_ready(self, ids: List[str]) -> None:
        """Poll until all documents in ``ids`` reach ``ready`` status.

        Raises ``RuntimeError`` as soon as any document reports ``error``
        status, and ``TimeoutError`` if any remain pending past
        ``poll_timeout`` seconds.
        """
        client = self._get_client()
        pending = set(ids)
        # monotonic clock so wall-clock adjustments can't skew the deadline
        deadline = time.monotonic() + self.poll_timeout

        while pending and time.monotonic() < deadline:
            still_pending: set[str] = set()
            for doc_id in pending:
                doc = client.documents.get(self.collection_id, doc_id)
                if doc.status == "error":
                    # Fail fast: one broken document aborts the whole wait.
                    raise RuntimeError(
                        f"Document {doc_id} failed to process: {getattr(doc, 'errorMessage', 'unknown error')}"
                    )
                if doc.status != "ready":
                    still_pending.add(doc_id)
            pending = still_pending
            # Only sleep when there is still work outstanding, so the happy
            # path returns without an extra poll_interval delay.
            if pending:
                time.sleep(self.poll_interval)

        if pending:
            raise TimeoutError(
                f"{len(pending)} document(s) not ready after {self.poll_timeout}s"
            )

    # ── Search (used by DeweyRetriever) ───────────────────────────────────────

    def _search(self, query: str, top_k: int) -> List[Document]:
        """Run a hybrid search and return Haystack Documents with metadata."""
        client = self._get_client()
        results = client.retrieval.query(self.collection_id, query, limit=top_k)
        return [_result_to_document(r) for r in results]

    # ── Serialization ─────────────────────────────────────────────────────────

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the store for Haystack pipeline YAML/dict dumps."""
        return default_to_dict(
            self,
            api_key=self.api_key.to_dict(),
            collection_id=self.collection_id,
            base_url=self.base_url,
            poll_interval=self.poll_interval,
            poll_timeout=self.poll_timeout,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DeweyDocumentStore":
        """Rebuild the store from :meth:`to_dict` output."""
        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
        return default_from_dict(cls, data)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
# ── Helpers ───────────────────────────────────────────────────────────────────
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _result_to_document(r: Any) -> Document:
    """Map one Dewey retrieval result onto a Haystack ``Document``.

    The chunk text becomes ``content``; score and citation fields
    (document, filename, section id/title/level) go into ``meta``.
    """
    citation_meta = {
        "score": r.score,
        "document_id": r.document.id,
        "filename": r.document.filename,
        "section_id": r.section.id,
        "section_title": r.section.title,
        "section_level": r.section.level,
    }
    return Document(content=r.chunk.content, meta=citation_meta)
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
# Register DeweyDocumentStore as a virtual subclass so isinstance checks
# against the DocumentStore protocol succeed at import time.
# NOTE(review): assumes haystack's DocumentStore exposes ABC-style
# `register` (the type-ignore suggests it is not in the type stubs) —
# confirm against the installed haystack-ai version.
DocumentStore.register(DeweyDocumentStore)  # type: ignore[attr-defined]
|