dewey-haystack 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,199 @@
1
+ Metadata-Version: 2.4
2
+ Name: dewey-haystack
3
+ Version: 0.1.0
4
+ Summary: Haystack integration for Dewey — document store, retriever, and research component
5
+ Author-email: Dewey <hi@meetdewey.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://meetdewey.com
8
+ Project-URL: Repository, https://github.com/meetdewey/dewey-haystack
9
+ Keywords: haystack,dewey,rag,retrieval,document-store,llm
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Requires-Python: >=3.9
19
+ Description-Content-Type: text/markdown
20
+ Requires-Dist: meetdewey>=1.0
21
+ Requires-Dist: haystack-ai>=2.0
22
+ Provides-Extra: dev
23
+ Requires-Dist: pytest>=7; extra == "dev"
24
+ Requires-Dist: pytest-mock>=3; extra == "dev"
25
+
26
+ # dewey-haystack
27
+
28
+ [![CI](https://github.com/meetdewey/dewey-haystack/actions/workflows/ci.yml/badge.svg)](https://github.com/meetdewey/dewey-haystack/actions/workflows/ci.yml)
29
+
30
+ [Haystack](https://haystack.deepset.ai/) integration for [Dewey](https://meetdewey.com) — document store, retriever, and research component.
31
+
32
+ ## Installation
33
+
34
+ ```bash
35
+ pip install dewey-haystack
36
+ ```
37
+
38
+ ## Components
39
+
40
+ ### DeweyDocumentStore
41
+
42
+ Haystack DocumentStore backed by a Dewey collection. Handles document upload and deletion; Dewey manages chunking and embeddings automatically.
43
+
44
+ ```python
45
+ from haystack_integrations.document_stores.dewey import DeweyDocumentStore
46
+ from haystack.utils import Secret
47
+
48
+ store = DeweyDocumentStore(
49
+ api_key=Secret.from_env_var("DEWEY_API_KEY"),
50
+ collection_id="3f7a1b2c-...",
51
+ )
52
+ ```
53
+
54
+ Upload Haystack Documents:
55
+
56
+ ```python
57
+ from haystack import Document
58
+
59
+ store.write_documents([
60
+ Document(content="Neural networks learn via backpropagation.", meta={"source": "ml.txt"}),
61
+ Document(content="Transformers use self-attention mechanisms."),
62
+ ])
63
+ ```
64
+
65
+ ### DeweyRetriever
66
+
67
+ Drop-in Haystack retriever backed by Dewey's hybrid semantic + BM25 search.
68
+
69
+ ```python
70
+ from haystack import Pipeline
71
+ from haystack_integrations.document_stores.dewey import DeweyDocumentStore
72
+ from haystack_integrations.components.retrievers.dewey import DeweyRetriever
73
+ from haystack.utils import Secret
74
+
75
+ store = DeweyDocumentStore(
76
+ api_key=Secret.from_env_var("DEWEY_API_KEY"),
77
+ collection_id="3f7a1b2c-...",
78
+ )
79
+
80
+ pipeline = Pipeline()
81
+ pipeline.add_component("retriever", DeweyRetriever(document_store=store, top_k=8))
82
+
83
+ result = pipeline.run({"retriever": {"query": "What are the key findings?"}})
84
+ for doc in result["retriever"]["documents"]:
85
+ print(f"[{doc.meta['filename']}] {doc.content}")
86
+ ```
87
+
88
+ Each returned `Document` carries citation metadata:
89
+
90
+ | Field | Description |
91
+ |---|---|
92
+ | `score` | Relevance score (0–1) |
93
+ | `document_id` | Dewey document ID |
94
+ | `filename` | Original filename |
95
+ | `section_id` | Section ID |
96
+ | `section_title` | Section heading |
97
+ | `section_level` | Heading depth (1 = top-level) |
98
+
99
+ **RAG pipeline with an LLM:**
100
+
101
+ ```python
102
+ from haystack.components.builders import PromptBuilder
103
+ from haystack.components.generators import OpenAIGenerator
104
+
105
+ prompt_template = """
106
+ Answer the question using only the provided context.
107
+
108
+ Context:
109
+ {% for doc in documents %}
110
+ - {{ doc.content }}
111
+ {% endfor %}
112
+
113
+ Question: {{ query }}
114
+ """
115
+
116
+ pipeline = Pipeline()
117
+ pipeline.add_component("retriever", DeweyRetriever(document_store=store, top_k=5))
118
+ pipeline.add_component("prompt", PromptBuilder(template=prompt_template))
119
+ pipeline.add_component("llm", OpenAIGenerator(model="gpt-4o-mini"))
120
+
121
+ pipeline.connect("retriever.documents", "prompt.documents")
122
+ pipeline.connect("prompt.prompt", "llm.prompt")
123
+
124
+ result = pipeline.run({
125
+ "retriever": {"query": "What were the main findings?"},
126
+ "prompt": {"query": "What were the main findings?"},
127
+ })
128
+ print(result["llm"]["replies"][0])
129
+ ```
130
+
131
+ ### DeweyResearchComponent
132
+
133
+ A Haystack component that runs Dewey's full agentic research loop — searching, reading, and synthesising across multiple documents — and returns a grounded Markdown answer with cited sources.
134
+
135
+ Use this as a drop-in replacement for an LLM generator when you want Dewey to handle both retrieval *and* generation.
136
+
137
+ ```python
138
+ from haystack import Pipeline
139
+ from haystack_integrations.components.retrievers.dewey import DeweyResearchComponent
140
+ from haystack.utils import Secret
141
+
142
+ pipeline = Pipeline()
143
+ pipeline.add_component(
144
+ "research",
145
+ DeweyResearchComponent(
146
+ api_key=Secret.from_env_var("DEWEY_API_KEY"),
147
+ collection_id="3f7a1b2c-...",
148
+ depth="balanced",
149
+ ),
150
+ )
151
+
152
+ result = pipeline.run({"research": {"query": "What were the key findings across all studies?"}})
153
+ print(result["research"]["answer"])
154
+
155
+ for source in result["research"]["sources"]:
156
+ print(f" [{source.meta['filename']}] {source.content[:80]}...")
157
+ ```
158
+
159
+ **Outputs:**
160
+
161
+ | Key | Type | Description |
162
+ |---|---|---|
163
+ | `answer` | `str` | Synthesised Markdown answer |
164
+ | `sources` | `list[Document]` | Source chunks cited by the answer |
165
+
166
+ **Research depths:**
167
+
168
+ | Depth | Speed | Tools | Requires BYOK |
169
+ |---|---|---|---|
170
+ | `quick` | fast | basic search | no |
171
+ | `balanced` | fast | basic search | no |
172
+ | `deep` | slower | full tool suite | yes |
173
+ | `exhaustive` | slowest | full tool suite | yes |
174
+
175
+ `deep` and `exhaustive` require a Dewey Pro plan and a BYOK API key configured on your project.
176
+
177
+ **With a custom model:**
178
+
179
+ ```python
180
+ DeweyResearchComponent(
181
+ api_key=Secret.from_env_var("DEWEY_API_KEY"),
182
+ collection_id="3f7a1b2c-...",
183
+ depth="deep",
184
+ model="claude-sonnet-4-6", # requires Anthropic BYOK key on your project
185
+ )
186
+ ```
187
+
188
+ ## Requirements
189
+
190
+ - Python 3.9+
191
+ - `meetdewey >= 1.0`
192
+ - `haystack-ai >= 2.0`
193
+
194
+ ## Development
195
+
196
+ ```bash
197
+ pip install -e ".[dev]"
198
+ pytest
199
+ ```
@@ -0,0 +1,9 @@
1
+ haystack_integrations/components/retrievers/dewey/__init__.py,sha256=NmjzgaPoebXfq31dse_dH-7VK-jTOJQxnpfs81Fw_T8,277
2
+ haystack_integrations/components/retrievers/dewey/dewey_research_component.py,sha256=WIQ6odb_835QdvWJR6eDulKujoT8_jRLsrrujUpk5kE,6190
3
+ haystack_integrations/components/retrievers/dewey/dewey_retriever.py,sha256=lFPXpNauvT0ot-q5teBS29o5xksMJBQor9HMuU7hNDk,4126
4
+ haystack_integrations/document_stores/dewey/__init__.py,sha256=8i96VIg-8vh1oWZENMMBuZtMJQo4ho-J31k7tJSxU7o,139
5
+ haystack_integrations/document_stores/dewey/dewey_document_store.py,sha256=GGpJ0CTvhQW2Ir522Sfwi4tOVMd1tpYFupyBK0LD9dQ,8391
6
+ dewey_haystack-0.1.0.dist-info/METADATA,sha256=PWDpEDxyt1jyMLjD1GFRLIXn7nBmGb_izxzY0c5zd7I,5901
7
+ dewey_haystack-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
8
+ dewey_haystack-0.1.0.dist-info/top_level.txt,sha256=xEV0YMHti-yzzXgkLgSzsILiqsuC91-4ShNpDgsKNRI,22
9
+ dewey_haystack-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ haystack_integrations
@@ -0,0 +1,8 @@
1
+ from haystack_integrations.components.retrievers.dewey.dewey_research_component import (
2
+ DeweyResearchComponent,
3
+ )
4
+ from haystack_integrations.components.retrievers.dewey.dewey_retriever import (
5
+ DeweyRetriever,
6
+ )
7
+
8
+ __all__ = ["DeweyRetriever", "DeweyResearchComponent"]
@@ -0,0 +1,162 @@
1
+ """DeweyResearchComponent — Haystack component that runs a full Dewey research query."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ from haystack import Document, component, default_from_dict, default_to_dict
8
+ from haystack.utils import Secret, deserialize_secrets_inplace
9
+
10
+
11
@component
class DeweyResearchComponent:
    """Run Dewey's agentic research loop from inside a Haystack pipeline.

    Where :class:`DeweyRetriever` performs a single hybrid search, this
    component drives Dewey's multi-step research workflow — search, read,
    synthesise across documents — and yields a grounded Markdown answer
    together with the source chunks it cited.

    It can stand in for an LLM generator whenever Dewey should own both
    retrieval and generation.

    Example — research pipeline::

        from haystack import Pipeline
        from haystack_integrations.components.retrievers.dewey import DeweyResearchComponent
        from haystack.utils import Secret

        pipeline = Pipeline()
        pipeline.add_component(
            "research",
            DeweyResearchComponent(
                api_key=Secret.from_env_var("DEWEY_API_KEY"),
                collection_id="3f7a1b2c-...",
                depth="balanced",
            ),
        )

        result = pipeline.run({"research": {"query": "What were the key findings?"}})
        print(result["research"]["answer"])
        for source in result["research"]["sources"]:
            print(f"  [{source.meta['filename']}] {source.content[:80]}...")

    Research depths: ``quick`` and ``balanced`` are fast and use the basic
    search tools with no BYOK requirement; ``deep`` and ``exhaustive`` are
    slower, use the full tool suite, and require a Pro plan with a BYOK API
    key configured on your Dewey project.
    """

    def __init__(
        self,
        api_key: Secret = Secret.from_env_var("DEWEY_API_KEY"),  # noqa: B008
        collection_id: str = "",
        depth: str = "balanced",
        model: Optional[str] = None,
        base_url: str = "https://api.meetdewey.com/v1",
    ) -> None:
        """
        Args:
            api_key: Dewey project API key. Defaults to the ``DEWEY_API_KEY``
                environment variable.
            collection_id: ID of the Dewey collection to research.
            depth: Research depth — ``"quick"``, ``"balanced"`` (default),
                ``"deep"``, or ``"exhaustive"``.
            model: Optional model override (e.g. ``"gpt-4o"``,
                ``"claude-sonnet-4-6"``). ``None`` uses Dewey's server-side
                default for the chosen depth.
            base_url: Dewey API base URL. Override for local development.

        Raises:
            ValueError: If ``depth`` is not a recognised research depth.
        """
        if depth not in {"quick", "balanced", "deep", "exhaustive"}:
            raise ValueError(
                f"Invalid depth {depth!r}. Must be one of: quick, balanced, deep, exhaustive"
            )
        self.api_key = api_key
        self.collection_id = collection_id
        self.depth = depth
        self.model = model
        self.base_url = base_url
        self._client: Any = None  # lazily created DeweyClient

    def _get_client(self) -> Any:
        # Import lazily so the dewey SDK is only required at run time.
        if self._client is None:
            from dewey import DeweyClient

            self._client = DeweyClient(
                api_key=self.api_key.resolve_value(), base_url=self.base_url
            )
        return self._client

    @component.output_types(answer=str, sources=List[Document])
    def run(self, query: str) -> Dict[str, Any]:
        """Execute one research query and block until the answer completes.

        Internally consumes Dewey's research SSE stream: ``chunk`` events are
        accumulated into the answer text, the terminal ``done`` event carries
        the cited sources, and an ``error`` event aborts the run.

        Args:
            query: The research question.

        Returns:
            A dict with:
            - ``"answer"`` (``str``): The synthesised Markdown answer.
            - ``"sources"`` (``list[Document]``): Source chunks cited, each
              with ``content`` and metadata (``filename``, ``document_id``,
              ``section_id``, ``section_title``, ``section_level``).

        Raises:
            RuntimeError: If the stream reports an ``error`` event.
        """
        api = self._get_client()
        options: Dict[str, Any] = {"depth": self.depth}
        if self.model:
            options["model"] = self.model

        pieces: List[str] = []
        cited: List[Document] = []

        for ev in api.research.stream(self.collection_id, query, **options):
            if ev.type == "error":
                raise RuntimeError(f"Dewey research error: {ev.message}")
            if ev.type == "chunk":
                pieces.append(ev.content)
            elif ev.type == "done":
                cited = [
                    Document(
                        content=src.content,
                        meta={
                            "document_id": src.documentId,
                            "filename": src.filename,
                            "section_id": src.sectionId,
                            "section_title": src.sectionTitle,
                            "section_level": src.sectionLevel,
                        },
                    )
                    for src in (getattr(ev, "sources", None) or [])
                ]

        return {"answer": "".join(pieces), "sources": cited}

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the component; the API key is stored as a Secret dict."""
        return default_to_dict(
            self,
            api_key=self.api_key.to_dict(),
            collection_id=self.collection_id,
            depth=self.depth,
            model=self.model,
            base_url=self.base_url,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DeweyResearchComponent":
        """Rebuild the component from :meth:`to_dict` output."""
        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
        return default_from_dict(cls, data)
@@ -0,0 +1,116 @@
1
+ """DeweyRetriever — Haystack component backed by Dewey hybrid search."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ from haystack import Document, component, default_from_dict, default_to_dict
8
+
9
+ from haystack_integrations.document_stores.dewey import DeweyDocumentStore
10
+
11
+
12
@component
class DeweyRetriever:
    """Haystack retriever that queries a Dewey collection.

    Uses Dewey's hybrid semantic + BM25 search and returns chunks as Haystack
    ``Document`` objects with full citation metadata (``score``,
    ``document_id``, ``filename``, ``section_id``, ``section_title``,
    ``section_level``).

    Must be paired with a :class:`~haystack_integrations.document_stores.dewey.DeweyDocumentStore`.

    Example — standalone retrieval pipeline::

        from haystack import Pipeline
        from haystack_integrations.document_stores.dewey import DeweyDocumentStore
        from haystack_integrations.components.retrievers.dewey import DeweyRetriever
        from haystack.utils import Secret

        store = DeweyDocumentStore(
            api_key=Secret.from_env_var("DEWEY_API_KEY"),
            collection_id="3f7a1b2c-...",
        )

        pipeline = Pipeline()
        pipeline.add_component("retriever", DeweyRetriever(document_store=store, top_k=8))

        result = pipeline.run({"retriever": {"query": "What are the key findings?"}})
        for doc in result["retriever"]["documents"]:
            print(doc.content, doc.meta["score"])

    For a full RAG pipeline, connect ``retriever.documents`` to a
    ``PromptBuilder`` and feed the rendered prompt to a generator component.
    """

    def __init__(
        self,
        document_store: DeweyDocumentStore,
        top_k: int = 10,
        filters: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Args:
            document_store: A :class:`DeweyDocumentStore` instance.
            top_k: Maximum number of chunks to return (1–50).
            filters: Unused — present for interface compatibility. Dewey does
                not support metadata filtering at retrieval time.
        """
        self.document_store = document_store
        self.top_k = top_k
        self.filters = filters

    @component.output_types(documents=List[Document])
    def run(
        self,
        query: str,
        top_k: Optional[int] = None,
    ) -> Dict[str, List[Document]]:
        """Run the retriever.

        Args:
            query: The search query.
            top_k: Override the instance-level ``top_k`` for this call.

        Returns:
            A dict with a single key ``"documents"`` containing a list of
            :class:`~haystack.Document` objects ordered by relevance score.
        """
        k = top_k if top_k is not None else self.top_k
        documents = self.document_store._search(query, k)
        return {"documents": documents}

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the retriever, including the nested document store."""
        return default_to_dict(
            self,
            document_store=self.document_store.to_dict(),
            top_k=self.top_k,
            # Bug fix: `filters` was previously omitted here, so a
            # to_dict/from_dict round-trip silently reset it to None.
            filters=self.filters,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DeweyRetriever":
        """Rebuild the retriever, reconstructing its document store first.

        Payloads produced before ``filters`` was serialized still load; the
        constructor default (``None``) applies when the key is absent.
        """
        data["init_parameters"]["document_store"] = DeweyDocumentStore.from_dict(
            data["init_parameters"]["document_store"]
        )
        return default_from_dict(cls, data)
@@ -0,0 +1,5 @@
1
+ from haystack_integrations.document_stores.dewey.dewey_document_store import (
2
+ DeweyDocumentStore,
3
+ )
4
+
5
+ __all__ = ["DeweyDocumentStore"]
@@ -0,0 +1,210 @@
1
+ """DeweyDocumentStore — Haystack DocumentStore backed by a Dewey collection."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import time
7
+ from typing import Any, Dict, List, Optional
8
+
9
+ from haystack import Document, default_from_dict, default_to_dict
10
+ from haystack.document_stores.types import DocumentStore, DuplicatePolicy
11
+ from haystack.utils import Secret, deserialize_secrets_inplace
12
+
13
+
14
class DeweyDocumentStore:
    """Haystack DocumentStore backed by a Dewey collection.

    Chunking and embedding happen server-side in Dewey: every Haystack
    ``Document`` written here is uploaded as a plain-text file and run
    through Dewey's ingestion pipeline. Query through
    :class:`DeweyRetriever`; ``filter_documents`` is intentionally limited
    (see its docstring).

    Example::

        from haystack_integrations.document_stores.dewey import DeweyDocumentStore
        from haystack_integrations.components.retrievers.dewey import DeweyRetriever
        from haystack.utils import Secret

        store = DeweyDocumentStore(
            api_key=Secret.from_env_var("DEWEY_API_KEY"),
            collection_id="3f7a1b2c-...",
        )
        retriever = DeweyRetriever(document_store=store, top_k=8)
    """

    def __init__(
        self,
        api_key: Secret = Secret.from_env_var("DEWEY_API_KEY"),  # noqa: B008
        collection_id: str = "",
        base_url: str = "https://api.meetdewey.com/v1",
        poll_interval: float = 2.0,
        poll_timeout: float = 300.0,
    ) -> None:
        """
        Args:
            api_key: Dewey project API key. Defaults to the ``DEWEY_API_KEY``
                environment variable.
            collection_id: ID of the Dewey collection to use.
            base_url: Dewey API base URL. Override for local development.
            poll_interval: Seconds between status polls in ``write_documents``.
            poll_timeout: Maximum seconds to wait for documents to be ready.
        """
        self.api_key = api_key
        self.collection_id = collection_id
        self.base_url = base_url
        self.poll_interval = poll_interval
        self.poll_timeout = poll_timeout
        self._client: Any = None  # lazily created DeweyClient

    def _get_client(self) -> Any:
        # Import lazily so the dewey SDK is only required at run time.
        if self._client is None:
            from dewey import DeweyClient

            self._client = DeweyClient(
                api_key=self.api_key.resolve_value(), base_url=self.base_url
            )
        return self._client

    # ── DocumentStore protocol ────────────────────────────────────────────────

    def count_documents(self) -> int:
        """Return the number of documents (files) in the collection."""
        listing = self._get_client().documents.list(self.collection_id)
        return len(listing)

    def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]:
        """Return documents matching ``filters``.

        Dewey does not support arbitrary metadata filtering, so any non-None
        ``filters`` raises ``NotImplementedError``. Calling with
        ``filters=None`` returns an empty list — use ``count_documents`` for
        collection size and :class:`DeweyRetriever` to retrieve content.
        """
        if filters is None:
            return []
        raise NotImplementedError(
            "DeweyDocumentStore does not support metadata filters. "
            "Use DeweyRetriever for semantic search."
        )

    def write_documents(
        self,
        documents: List[Document],
        policy: DuplicatePolicy = DuplicatePolicy.NONE,
    ) -> int:
        """Upload documents to the collection and wait until they are ready.

        Each Haystack ``Document`` is uploaded as a ``.txt`` file, named from
        ``meta["source"]`` / ``meta["filename"]`` or falling back to the
        document ``id``. Dewey handles chunking and embedding automatically.

        Args:
            documents: Documents to upload. Ones with empty ``content`` are
                skipped rather than uploaded as empty files.
            policy: Duplicate handling policy. Only ``NONE`` and ``OVERWRITE``
                are meaningful — Dewey deduplicates by content hash.

        Returns:
            Number of documents successfully submitted for ingestion.
        """
        client = self._get_client()
        submitted: List[str] = []

        for document in documents:
            if not document.content:
                continue
            uploaded = client.documents.upload(
                self.collection_id,
                io.BytesIO(document.content.encode("utf-8")),
                filename=self._filename_for(document),
                content_type="text/plain",
            )
            submitted.append(uploaded.id)

        self._wait_for_ready(submitted)
        return len(submitted)

    @staticmethod
    def _filename_for(document: Document) -> str:
        """Choose the upload filename, ensuring a ``.txt`` default extension."""
        name = document.meta.get("source") or document.meta.get("filename") or f"{document.id}.txt"
        # Only the final path component is checked for an extension.
        if "." not in name.split("/")[-1]:
            name = f"{name}.txt"
        return name

    def delete_documents(self, document_ids: List[str]) -> None:
        """Delete documents from the collection by their Dewey document IDs."""
        client = self._get_client()
        for doc_id in document_ids:
            client.documents.delete(self.collection_id, doc_id)

    # ── Ingestion polling / search (used by DeweyRetriever) ───────────────────

    def _wait_for_ready(self, ids: List[str]) -> None:
        """Block until every document in ``ids`` reaches ``ready`` status.

        Raises:
            RuntimeError: If any document ends in ``error`` status.
            TimeoutError: If documents remain unprocessed past ``poll_timeout``.
        """
        client = self._get_client()
        waiting = set(ids)
        deadline = time.monotonic() + self.poll_timeout

        while waiting and time.monotonic() < deadline:
            remaining: set[str] = set()
            for doc_id in waiting:
                record = client.documents.get(self.collection_id, doc_id)
                if record.status == "error":
                    raise RuntimeError(
                        f"Document {doc_id} failed to process: {getattr(record, 'errorMessage', 'unknown error')}"
                    )
                if record.status != "ready":
                    remaining.add(doc_id)
            waiting = remaining
            if waiting:
                time.sleep(self.poll_interval)

        if waiting:
            raise TimeoutError(
                f"{len(waiting)} document(s) not ready after {self.poll_timeout}s"
            )

    def _search(self, query: str, top_k: int) -> List[Document]:
        """Run a hybrid search and map each hit to a Haystack Document."""
        hits = self._get_client().retrieval.query(self.collection_id, query, limit=top_k)
        return [_result_to_document(hit) for hit in hits]

    # ── Serialization ─────────────────────────────────────────────────────────

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the store; the API key is stored as a Secret dict."""
        return default_to_dict(
            self,
            api_key=self.api_key.to_dict(),
            collection_id=self.collection_id,
            base_url=self.base_url,
            poll_interval=self.poll_interval,
            poll_timeout=self.poll_timeout,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DeweyDocumentStore":
        """Rebuild the store from :meth:`to_dict` output."""
        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
        return default_from_dict(cls, data)
190
+
191
+
192
+ # ── Helpers ───────────────────────────────────────────────────────────────────
193
+
194
+
195
def _result_to_document(result: Any) -> Document:
    """Convert one Dewey retrieval hit into a Haystack ``Document``.

    The chunk text becomes ``content``; relevance score and citation fields
    (document, section) are carried in ``meta``.
    """
    source_doc = result.document
    section = result.section
    meta = {
        "score": result.score,
        "document_id": source_doc.id,
        "filename": source_doc.filename,
        "section_id": section.id,
        "section_title": section.title,
        "section_level": section.level,
    }
    return Document(content=result.chunk.content, meta=meta)
207
+
208
+
209
# Register DeweyDocumentStore as a virtual subclass of Haystack's
# DocumentStore at import time, so isinstance()/issubclass() checks accept it
# without inheritance. NOTE(review): assumes DocumentStore exposes ABC-style
# .register — the `type: ignore` suggests it is absent from its static
# interface; confirm against the installed haystack-ai version.
DocumentStore.register(DeweyDocumentStore)  # type: ignore[attr-defined]