haiku.rag-slim 0.16.0 (haiku_rag_slim-0.16.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of haiku.rag-slim might be problematic.
- haiku/rag/__init__.py +0 -0
- haiku/rag/app.py +542 -0
- haiku/rag/chunker.py +65 -0
- haiku/rag/cli.py +466 -0
- haiku/rag/client.py +731 -0
- haiku/rag/config/__init__.py +74 -0
- haiku/rag/config/loader.py +94 -0
- haiku/rag/config/models.py +99 -0
- haiku/rag/embeddings/__init__.py +49 -0
- haiku/rag/embeddings/base.py +25 -0
- haiku/rag/embeddings/ollama.py +28 -0
- haiku/rag/embeddings/openai.py +26 -0
- haiku/rag/embeddings/vllm.py +29 -0
- haiku/rag/embeddings/voyageai.py +27 -0
- haiku/rag/graph/__init__.py +26 -0
- haiku/rag/graph/agui/__init__.py +53 -0
- haiku/rag/graph/agui/cli_renderer.py +135 -0
- haiku/rag/graph/agui/emitter.py +197 -0
- haiku/rag/graph/agui/events.py +254 -0
- haiku/rag/graph/agui/server.py +310 -0
- haiku/rag/graph/agui/state.py +34 -0
- haiku/rag/graph/agui/stream.py +86 -0
- haiku/rag/graph/common/__init__.py +5 -0
- haiku/rag/graph/common/models.py +42 -0
- haiku/rag/graph/common/nodes.py +265 -0
- haiku/rag/graph/common/prompts.py +46 -0
- haiku/rag/graph/common/utils.py +44 -0
- haiku/rag/graph/deep_qa/__init__.py +1 -0
- haiku/rag/graph/deep_qa/dependencies.py +27 -0
- haiku/rag/graph/deep_qa/graph.py +243 -0
- haiku/rag/graph/deep_qa/models.py +20 -0
- haiku/rag/graph/deep_qa/prompts.py +59 -0
- haiku/rag/graph/deep_qa/state.py +56 -0
- haiku/rag/graph/research/__init__.py +3 -0
- haiku/rag/graph/research/common.py +87 -0
- haiku/rag/graph/research/dependencies.py +151 -0
- haiku/rag/graph/research/graph.py +295 -0
- haiku/rag/graph/research/models.py +166 -0
- haiku/rag/graph/research/prompts.py +107 -0
- haiku/rag/graph/research/state.py +85 -0
- haiku/rag/logging.py +56 -0
- haiku/rag/mcp.py +245 -0
- haiku/rag/monitor.py +194 -0
- haiku/rag/qa/__init__.py +33 -0
- haiku/rag/qa/agent.py +93 -0
- haiku/rag/qa/prompts.py +60 -0
- haiku/rag/reader.py +135 -0
- haiku/rag/reranking/__init__.py +63 -0
- haiku/rag/reranking/base.py +13 -0
- haiku/rag/reranking/cohere.py +34 -0
- haiku/rag/reranking/mxbai.py +28 -0
- haiku/rag/reranking/vllm.py +44 -0
- haiku/rag/reranking/zeroentropy.py +59 -0
- haiku/rag/store/__init__.py +4 -0
- haiku/rag/store/engine.py +309 -0
- haiku/rag/store/models/__init__.py +4 -0
- haiku/rag/store/models/chunk.py +17 -0
- haiku/rag/store/models/document.py +17 -0
- haiku/rag/store/repositories/__init__.py +9 -0
- haiku/rag/store/repositories/chunk.py +442 -0
- haiku/rag/store/repositories/document.py +261 -0
- haiku/rag/store/repositories/settings.py +165 -0
- haiku/rag/store/upgrades/__init__.py +62 -0
- haiku/rag/store/upgrades/v0_10_1.py +64 -0
- haiku/rag/store/upgrades/v0_9_3.py +112 -0
- haiku/rag/utils.py +211 -0
- haiku_rag_slim-0.16.0.dist-info/METADATA +128 -0
- haiku_rag_slim-0.16.0.dist-info/RECORD +71 -0
- haiku_rag_slim-0.16.0.dist-info/WHEEL +4 -0
- haiku_rag_slim-0.16.0.dist-info/entry_points.txt +2 -0
- haiku_rag_slim-0.16.0.dist-info/licenses/LICENSE +7 -0
haiku/rag/client.py
ADDED
@@ -0,0 +1,731 @@
import hashlib
import logging
import mimetypes
import tempfile
from collections.abc import AsyncGenerator
from pathlib import Path
from urllib.parse import urlparse

import httpx

from haiku.rag.config import AppConfig, Config
from haiku.rag.reranking import get_reranker
from haiku.rag.store.engine import Store
from haiku.rag.store.models.chunk import Chunk
from haiku.rag.store.models.document import Document
from haiku.rag.store.repositories.chunk import ChunkRepository
from haiku.rag.store.repositories.document import DocumentRepository
from haiku.rag.store.repositories.settings import SettingsRepository

logger = logging.getLogger(__name__)


class HaikuRAG:
    """High-level haiku-rag client."""

    def __init__(
        self,
        db_path: Path | None = None,
        config: AppConfig = Config,
        skip_validation: bool = False,
        allow_create: bool = True,
    ):
        """Initialize the RAG client with a database path.

        Args:
            db_path: Path to the database file. If None, uses config.storage.data_dir.
            config: Configuration to use. Defaults to global Config.
            skip_validation: Whether to skip configuration validation on database load.
            allow_create: Whether to allow database creation. If False, will raise error
                if database doesn't exist (for read operations).
        """
        self._config = config
        if db_path is None:
            db_path = self._config.storage.data_dir / "haiku.rag.lancedb"
        self.store = Store(
            db_path,
            config=self._config,
            skip_validation=skip_validation,
            allow_create=allow_create,
        )
        self.document_repository = DocumentRepository(self.store)
        self.chunk_repository = ChunkRepository(self.store)

    async def __aenter__(self):
        """Async context manager entry."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):  # noqa: ARG002
        """Async context manager exit."""
        # Wait for any pending vacuum to complete before closing
        async with self.store._vacuum_lock:
            pass
        self.close()
        return False

    async def _create_document_with_docling(
        self,
        docling_document,
        uri: str | None = None,
        title: str | None = None,
        metadata: dict | None = None,
        chunks: list[Chunk] | None = None,
    ) -> Document:
        """Create a new document from DoclingDocument."""
        content = docling_document.export_to_markdown()
        document = Document(
            content=content,
            uri=uri,
            title=title,
            metadata=metadata or {},
        )
        return await self.document_repository._create_and_chunk(
            document, docling_document, chunks
        )

    async def create_document(
        self,
        content: str,
        uri: str | None = None,
        title: str | None = None,
        metadata: dict | None = None,
        chunks: list[Chunk] | None = None,
    ) -> Document:
        """Create a new document with optional URI and metadata.

        Args:
            content: The text content of the document.
            uri: Optional URI identifier for the document.
            metadata: Optional metadata dictionary.
            chunks: Optional list of pre-created chunks to use instead of generating new ones.

        Returns:
            The created Document instance.
        """
        document = Document(
            content=content,
            uri=uri,
            title=title,
            metadata=metadata or {},
        )

        # Only create docling_document if we need to generate chunks
        if chunks is None:
            # Lazy import to avoid loading docling
            from haiku.rag.utils import text_to_docling_document

            docling_document = text_to_docling_document(content)
        else:
            # Chunks already provided, no conversion needed
            docling_document = None

        return await self.document_repository._create_and_chunk(
            document, docling_document, chunks
        )

    async def create_document_from_source(
        self, source: str | Path, title: str | None = None, metadata: dict | None = None
    ) -> Document | list[Document]:
        """Create or update document(s) from a file path, directory, or URL.

        Checks if a document with the same URI already exists:
        - If MD5 is unchanged, returns existing document
        - If MD5 changed, updates the document
        - If no document exists, creates a new one

        Args:
            source: File path, directory (as string or Path), or URL to parse
            title: Optional title (only used for single files, not directories)
            metadata: Optional metadata dictionary

        Returns:
            Document instance (created, updated, or existing) for single files/URLs
            List of Document instances for directories

        Raises:
            ValueError: If the file/URL cannot be parsed or doesn't exist
            httpx.RequestError: If URL request fails
        """
        # Normalize metadata
        metadata = metadata or {}

        # Check if it's a URL
        source_str = str(source)
        parsed_url = urlparse(source_str)
        if parsed_url.scheme in ("http", "https"):
            return await self._create_or_update_document_from_url(
                source_str, title=title, metadata=metadata
            )
        elif parsed_url.scheme == "file":
            # Handle file:// URI by converting to path
            source_path = Path(parsed_url.path)
        else:
            # Handle as regular file path
            source_path = Path(source) if isinstance(source, str) else source

        # Handle directories
        if source_path.is_dir():
            from haiku.rag.monitor import FileFilter

            documents = []
            filter = FileFilter(
                ignore_patterns=self._config.monitor.ignore_patterns or None,
                include_patterns=self._config.monitor.include_patterns or None,
            )
            for path in source_path.rglob("*"):
                if path.is_file() and filter.include_file(str(path)):
                    doc = await self._create_document_from_file(
                        path, title=None, metadata=metadata
                    )
                    documents.append(doc)
            return documents

        # Handle single file
        return await self._create_document_from_file(
            source_path, title=title, metadata=metadata
        )

    async def _create_document_from_file(
        self, source_path: Path, title: str | None = None, metadata: dict | None = None
    ) -> Document:
        """Create or update a document from a single file path.

        Args:
            source_path: Path to the file
            title: Optional title
            metadata: Optional metadata dictionary

        Returns:
            Document instance (created, updated, or existing)

        Raises:
            ValueError: If the file cannot be parsed or doesn't exist
        """
        # Lazy import to avoid loading docling
        from haiku.rag.reader import FileReader

        metadata = metadata or {}

        if source_path.suffix.lower() not in FileReader.extensions:
            raise ValueError(f"Unsupported file extension: {source_path.suffix}")

        if not source_path.exists():
            raise ValueError(f"File does not exist: {source_path}")

        uri = source_path.absolute().as_uri()
        md5_hash = hashlib.md5(source_path.read_bytes()).hexdigest()

        # Get content type from file extension (do before early return)
        content_type, _ = mimetypes.guess_type(str(source_path))
        if not content_type:
            content_type = "application/octet-stream"
        # Merge metadata with contentType and md5
        metadata.update({"contentType": content_type, "md5": md5_hash})

        # Check if document already exists
        existing_doc = await self.get_document_by_uri(uri)
        if existing_doc and existing_doc.metadata.get("md5") == md5_hash:
            # MD5 unchanged; update title/metadata if provided
            updated = False
            if title is not None and title != existing_doc.title:
                existing_doc.title = title
                updated = True

            # Check if metadata actually changed (beyond contentType and md5)
            merged_metadata = {**(existing_doc.metadata or {}), **metadata}
            if merged_metadata != existing_doc.metadata:
                existing_doc.metadata = merged_metadata
                updated = True

            if updated:
                return await self.document_repository.update(existing_doc)
            return existing_doc

        # Parse file only when content changed or new document
        docling_document = FileReader.parse_file(source_path)

        if existing_doc:
            # Update existing document
            existing_doc.content = docling_document.export_to_markdown()
            existing_doc.metadata = metadata
            if title is not None:
                existing_doc.title = title
            return await self.document_repository._update_and_rechunk(
                existing_doc, docling_document
            )
        else:
            # Create new document using DoclingDocument
            return await self._create_document_with_docling(
                docling_document=docling_document,
                uri=uri,
                title=title,
                metadata=metadata,
            )

    async def _create_or_update_document_from_url(
        self, url: str, title: str | None = None, metadata: dict | None = None
    ) -> Document:
        """Create or update a document from a URL by downloading and parsing the content.

        Checks if a document with the same URI already exists:
        - If MD5 is unchanged, returns existing document
        - If MD5 changed, updates the document
        - If no document exists, creates a new one

        Args:
            url: URL to download and parse
            metadata: Optional metadata dictionary

        Returns:
            Document instance (created, updated, or existing)

        Raises:
            ValueError: If the content cannot be parsed
            httpx.RequestError: If URL request fails
        """
        # Lazy import to avoid loading docling
        from haiku.rag.reader import FileReader

        metadata = metadata or {}

        async with httpx.AsyncClient() as client:
            response = await client.get(url)
            response.raise_for_status()

            md5_hash = hashlib.md5(response.content).hexdigest()

            # Get content type early (used for potential no-op update)
            content_type = response.headers.get("content-type", "").lower()

            # Check if document already exists
            existing_doc = await self.get_document_by_uri(url)
            if existing_doc and existing_doc.metadata.get("md5") == md5_hash:
                # MD5 unchanged; update title/metadata if provided
                updated = False
                if title is not None and title != existing_doc.title:
                    existing_doc.title = title
                    updated = True

                metadata.update({"contentType": content_type, "md5": md5_hash})
                # Check if metadata actually changed (beyond contentType and md5)
                merged_metadata = {**(existing_doc.metadata or {}), **metadata}
                if merged_metadata != existing_doc.metadata:
                    existing_doc.metadata = merged_metadata
                    updated = True

                if updated:
                    return await self.document_repository.update(existing_doc)
                return existing_doc
            file_extension = self._get_extension_from_content_type_or_url(
                url, content_type
            )

            if file_extension not in FileReader.extensions:
                raise ValueError(
                    f"Unsupported content type/extension: {content_type}/{file_extension}"
                )

            # Create a temporary file with the appropriate extension
            with tempfile.NamedTemporaryFile(
                mode="wb", suffix=file_extension
            ) as temp_file:
                temp_file.write(response.content)
                temp_file.flush()  # Ensure content is written to disk
                temp_path = Path(temp_file.name)

                # Parse the content using FileReader
                docling_document = FileReader.parse_file(temp_path)

                # Merge metadata with contentType and md5
                metadata.update({"contentType": content_type, "md5": md5_hash})

                if existing_doc:
                    existing_doc.content = docling_document.export_to_markdown()
                    existing_doc.metadata = metadata
                    if title is not None:
                        existing_doc.title = title
                    return await self.document_repository._update_and_rechunk(
                        existing_doc, docling_document
                    )
                else:
                    return await self._create_document_with_docling(
                        docling_document=docling_document,
                        uri=url,
                        title=title,
                        metadata=metadata,
                    )

    def _get_extension_from_content_type_or_url(
        self, url: str, content_type: str
    ) -> str:
        """Determine file extension from content type or URL."""
        # Common content type mappings
        content_type_map = {
            "text/html": ".html",
            "text/plain": ".txt",
            "text/markdown": ".md",
            "application/pdf": ".pdf",
            "application/json": ".json",
            "text/csv": ".csv",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
        }

        # Try content type first
        for ct, ext in content_type_map.items():
            if ct in content_type:
                return ext

        # Try URL extension
        parsed_url = urlparse(url)
        path = Path(parsed_url.path)
        if path.suffix:
            return path.suffix.lower()

        # Default to .html for web content
        return ".html"

    async def get_document_by_id(self, document_id: str) -> Document | None:
        """Get a document by its ID.

        Args:
            document_id: The unique identifier of the document.

        Returns:
            The Document instance if found, None otherwise.
        """
        return await self.document_repository.get_by_id(document_id)

    async def get_document_by_uri(self, uri: str) -> Document | None:
        """Get a document by its URI.

        Args:
            uri: The URI identifier of the document.

        Returns:
            The Document instance if found, None otherwise.
        """
        return await self.document_repository.get_by_uri(uri)

    async def update_document(self, document: Document) -> Document:
        """Update an existing document."""
        # Lazy import to avoid loading docling
        from haiku.rag.utils import text_to_docling_document

        # Convert content to DoclingDocument
        docling_document = text_to_docling_document(document.content)

        return await self.document_repository._update_and_rechunk(
            document, docling_document
        )

    async def delete_document(self, document_id: str) -> bool:
        """Delete a document by its ID."""
        return await self.document_repository.delete(document_id)

    async def list_documents(
        self,
        limit: int | None = None,
        offset: int | None = None,
        filter: str | None = None,
    ) -> list[Document]:
        """List all documents with optional pagination and filtering.

        Args:
            limit: Maximum number of documents to return.
            offset: Number of documents to skip.
            filter: Optional SQL WHERE clause to filter documents.

        Returns:
            List of Document instances matching the criteria.
        """
        return await self.document_repository.list_all(
            limit=limit, offset=offset, filter=filter
        )

    async def search(
        self,
        query: str,
        limit: int = 5,
        search_type: str = "hybrid",
        filter: str | None = None,
    ) -> list[tuple[Chunk, float]]:
        """Search for relevant chunks using the specified search method with optional reranking.

        Args:
            query: The search query string.
            limit: Maximum number of results to return.
            search_type: Type of search - "vector", "fts", or "hybrid" (default).
            filter: Optional SQL WHERE clause to filter documents before searching chunks.

        Returns:
            List of (chunk, score) tuples ordered by relevance.
        """
        # Get reranker if available
        reranker = get_reranker(config=self._config)

        if reranker is None:
            # No reranking - return direct search results
            return await self.chunk_repository.search(query, limit, search_type, filter)

        # Get more initial results (3X) for reranking
        search_limit = limit * 3
        search_results = await self.chunk_repository.search(
            query, search_limit, search_type, filter
        )

        # Apply reranking
        chunks = [chunk for chunk, _ in search_results]
        reranked_results = await reranker.rerank(query, chunks, top_n=limit)

        # Return reranked results with scores from reranker
        return reranked_results

    async def expand_context(
        self,
        search_results: list[tuple[Chunk, float]],
        radius: int | None = None,
    ) -> list[tuple[Chunk, float]]:
        """Expand search results with adjacent chunks, merging overlapping chunks.

        Args:
            search_results: List of (chunk, score) tuples from search.
            radius: Number of adjacent chunks to include before/after each chunk.
                If None, uses config.processing.context_chunk_radius.

        Returns:
            List of (chunk, score) tuples with expanded and merged context chunks.
        """
        if radius is None:
            radius = self._config.processing.context_chunk_radius
        if radius == 0:
            return search_results

        # Group chunks by document_id to handle merging within documents
        document_groups = {}
        for chunk, score in search_results:
            doc_id = chunk.document_id
            if doc_id not in document_groups:
                document_groups[doc_id] = []
            document_groups[doc_id].append((chunk, score))

        results = []

        for doc_id, doc_chunks in document_groups.items():
            # Get all expanded ranges for this document
            expanded_ranges = []
            for chunk, score in doc_chunks:
                adjacent_chunks = await self.chunk_repository.get_adjacent_chunks(
                    chunk, radius
                )

                all_chunks = adjacent_chunks + [chunk]

                # Get the range of orders for this expanded chunk
                orders = [c.order for c in all_chunks]
                min_order = min(orders)
                max_order = max(orders)

                expanded_ranges.append(
                    {
                        "original_chunk": chunk,
                        "score": score,
                        "min_order": min_order,
                        "max_order": max_order,
                        "all_chunks": sorted(all_chunks, key=lambda c: c.order),
                    }
                )

            # Merge overlapping/adjacent ranges
            merged_ranges = self._merge_overlapping_ranges(expanded_ranges)

            # Create merged chunks
            for merged_range in merged_ranges:
                combined_content_parts = [c.content for c in merged_range["all_chunks"]]

                # Use the first original chunk for metadata
                original_chunk = merged_range["original_chunks"][0]

                merged_chunk = Chunk(
                    id=original_chunk.id,
                    document_id=original_chunk.document_id,
                    content="".join(combined_content_parts),
                    metadata=original_chunk.metadata,
                    document_uri=original_chunk.document_uri,
                    document_title=original_chunk.document_title,
                    document_meta=original_chunk.document_meta,
                )

                # Use the highest score from merged chunks
                best_score = max(merged_range["scores"])
                results.append((merged_chunk, best_score))

        return results

    def _merge_overlapping_ranges(self, expanded_ranges):
        """Merge overlapping or adjacent expanded ranges."""
        if not expanded_ranges:
            return []

        # Sort by min_order
        sorted_ranges = sorted(expanded_ranges, key=lambda x: x["min_order"])
        merged = []

        current = {
            "min_order": sorted_ranges[0]["min_order"],
            "max_order": sorted_ranges[0]["max_order"],
            "original_chunks": [sorted_ranges[0]["original_chunk"]],
            "scores": [sorted_ranges[0]["score"]],
            "all_chunks": sorted_ranges[0]["all_chunks"],
        }

        for range_info in sorted_ranges[1:]:
            # Check if ranges overlap or are adjacent (max_order + 1 >= min_order)
            if current["max_order"] >= range_info["min_order"] - 1:
                # Merge ranges
                current["max_order"] = max(
                    current["max_order"], range_info["max_order"]
                )
                current["original_chunks"].append(range_info["original_chunk"])
                current["scores"].append(range_info["score"])

                # Merge all_chunks and deduplicate by order
                all_chunks_dict = {}
                for chunk in current["all_chunks"] + range_info["all_chunks"]:
                    order = chunk.order
                    all_chunks_dict[order] = chunk
                current["all_chunks"] = [
                    all_chunks_dict[order] for order in sorted(all_chunks_dict.keys())
                ]
            else:
                # No overlap, add current to merged and start new
                merged.append(current)
                current = {
                    "min_order": range_info["min_order"],
                    "max_order": range_info["max_order"],
                    "original_chunks": [range_info["original_chunk"]],
                    "scores": [range_info["score"]],
                    "all_chunks": range_info["all_chunks"],
                }

        # Add the last range
        merged.append(current)
        return merged

    async def ask(
        self, question: str, cite: bool = False, system_prompt: str | None = None
    ) -> str:
        """Ask a question using the configured QA agent.

        Args:
            question: The question to ask.
            cite: Whether to include citations in the response.
            system_prompt: Optional custom system prompt for the QA agent.

        Returns:
            The generated answer as a string.
        """
        from haiku.rag.qa import get_qa_agent

        qa_agent = get_qa_agent(
            self, config=self._config, use_citations=cite, system_prompt=system_prompt
        )
        return await qa_agent.answer(question)

    async def rebuild_database(self) -> AsyncGenerator[str, None]:
        """Rebuild the database by deleting all chunks and re-indexing all documents.

        For documents with URIs:
        - Re-adds from source if source exists
        - Re-embeds from existing content if source is missing

        For documents without URIs:
        - Re-creates chunks from existing content

        Yields:
            int: The ID of the document currently being processed
        """
        # Lazy import to avoid loading docling
        from haiku.rag.utils import text_to_docling_document

        await self.chunk_repository.delete_all()
        self.store.recreate_embeddings_table()

        # Update settings to current config
        settings_repo = SettingsRepository(self.store)
        settings_repo.save_current_settings()

        documents = await self.list_documents()

        for doc in documents:
            assert doc.id is not None, "Document ID should not be None"
            if doc.uri:
                # Document has a URI - check if source is accessible
                source_accessible = False
                parsed_url = urlparse(doc.uri)

                try:
                    if parsed_url.scheme == "file":
                        # Check if file exists
                        source_path = Path(parsed_url.path)
                        source_accessible = source_path.exists()
                    elif parsed_url.scheme in ("http", "https"):
                        # For URLs, we'll try to create and catch errors
                        source_accessible = True
                    else:
                        source_accessible = False
                except Exception:
                    source_accessible = False

                if source_accessible:
                    # Source exists - delete and recreate from source
                    try:
                        await self.delete_document(doc.id)
                        new_doc = await self.create_document_from_source(
                            source=doc.uri, metadata=doc.metadata or {}
                        )
                        # URIs always point to single files/URLs, never directories
                        assert isinstance(new_doc, Document)
                        assert new_doc.id is not None, (
                            "New document ID should not be None"
                        )
                        yield new_doc.id
                    except Exception as e:
                        logger.error(
                            "Error recreating document from source %s: %s",
                            doc.uri,
                            e,
                        )
                        continue
                else:
                    # Source missing - re-embed from existing content
                    logger.warning(
                        "Source missing for %s, re-embedding from content", doc.uri
                    )
                    docling_document = text_to_docling_document(doc.content)
                    await self.chunk_repository.create_chunks_for_document(
                        doc.id, docling_document
                    )
                    yield doc.id
            else:
                # Document without URI - re-create chunks from existing content
                docling_document = text_to_docling_document(doc.content)
                await self.chunk_repository.create_chunks_for_document(
                    doc.id, docling_document
                )
                yield doc.id

        # Final maintenance: centralized vacuum to curb disk usage
        try:
            await self.store.vacuum()
        except Exception:
            pass

    async def vacuum(self) -> None:
        """Optimize and clean up old versions across all tables."""
        await self.store.vacuum()

    def close(self):
        """Close the underlying store connection."""
        self.store.close()