haiku.rag 0.10.2__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff shows the content changes between package versions that have been publicly released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- README.md +205 -0
- {haiku_rag-0.10.2.dist-info → haiku_rag-0.14.0.dist-info}/METADATA +100 -41
- haiku_rag-0.14.0.dist-info/RECORD +6 -0
- haiku/rag/__init__.py +0 -0
- haiku/rag/app.py +0 -437
- haiku/rag/chunker.py +0 -51
- haiku/rag/cli.py +0 -466
- haiku/rag/client.py +0 -605
- haiku/rag/config.py +0 -81
- haiku/rag/embeddings/__init__.py +0 -35
- haiku/rag/embeddings/base.py +0 -15
- haiku/rag/embeddings/ollama.py +0 -17
- haiku/rag/embeddings/openai.py +0 -16
- haiku/rag/embeddings/vllm.py +0 -19
- haiku/rag/embeddings/voyageai.py +0 -17
- haiku/rag/logging.py +0 -56
- haiku/rag/mcp.py +0 -156
- haiku/rag/migration.py +0 -316
- haiku/rag/monitor.py +0 -73
- haiku/rag/qa/__init__.py +0 -15
- haiku/rag/qa/agent.py +0 -91
- haiku/rag/qa/prompts.py +0 -60
- haiku/rag/reader.py +0 -115
- haiku/rag/reranking/__init__.py +0 -34
- haiku/rag/reranking/base.py +0 -13
- haiku/rag/reranking/cohere.py +0 -34
- haiku/rag/reranking/mxbai.py +0 -28
- haiku/rag/reranking/vllm.py +0 -44
- haiku/rag/research/__init__.py +0 -20
- haiku/rag/research/common.py +0 -53
- haiku/rag/research/dependencies.py +0 -47
- haiku/rag/research/graph.py +0 -29
- haiku/rag/research/models.py +0 -70
- haiku/rag/research/nodes/evaluate.py +0 -80
- haiku/rag/research/nodes/plan.py +0 -63
- haiku/rag/research/nodes/search.py +0 -93
- haiku/rag/research/nodes/synthesize.py +0 -51
- haiku/rag/research/prompts.py +0 -114
- haiku/rag/research/state.py +0 -25
- haiku/rag/store/__init__.py +0 -4
- haiku/rag/store/engine.py +0 -269
- haiku/rag/store/models/__init__.py +0 -4
- haiku/rag/store/models/chunk.py +0 -17
- haiku/rag/store/models/document.py +0 -17
- haiku/rag/store/repositories/__init__.py +0 -9
- haiku/rag/store/repositories/chunk.py +0 -424
- haiku/rag/store/repositories/document.py +0 -237
- haiku/rag/store/repositories/settings.py +0 -155
- haiku/rag/store/upgrades/__init__.py +0 -62
- haiku/rag/store/upgrades/v0_10_1.py +0 -64
- haiku/rag/store/upgrades/v0_9_3.py +0 -112
- haiku/rag/utils.py +0 -199
- haiku_rag-0.10.2.dist-info/RECORD +0 -54
- {haiku_rag-0.10.2.dist-info → haiku_rag-0.14.0.dist-info}/WHEEL +0 -0
- {haiku_rag-0.10.2.dist-info → haiku_rag-0.14.0.dist-info}/entry_points.txt +0 -0
- {haiku_rag-0.10.2.dist-info → haiku_rag-0.14.0.dist-info}/licenses/LICENSE +0 -0
haiku/rag/client.py
DELETED
@@ -1,605 +0,0 @@
-import hashlib
-import mimetypes
-import tempfile
-from collections.abc import AsyncGenerator
-from pathlib import Path
-from urllib.parse import urlparse
-
-import httpx
-
-from haiku.rag.config import Config
-from haiku.rag.reader import FileReader
-from haiku.rag.reranking import get_reranker
-from haiku.rag.store.engine import Store
-from haiku.rag.store.models.chunk import Chunk
-from haiku.rag.store.models.document import Document
-from haiku.rag.store.repositories.chunk import ChunkRepository
-from haiku.rag.store.repositories.document import DocumentRepository
-from haiku.rag.store.repositories.settings import SettingsRepository
-from haiku.rag.utils import text_to_docling_document
-
-
-class HaikuRAG:
-    """High-level haiku-rag client."""
-
-    def __init__(
-        self,
-        db_path: Path = Config.DEFAULT_DATA_DIR / "haiku.rag.lancedb",
-        skip_validation: bool = False,
-    ):
-        """Initialize the RAG client with a database path.
-
-        Args:
-            db_path: Path to the database file.
-            skip_validation: Whether to skip configuration validation on database load.
-        """
-        self.store = Store(db_path, skip_validation=skip_validation)
-        self.document_repository = DocumentRepository(self.store)
-        self.chunk_repository = ChunkRepository(self.store)
-
-    async def __aenter__(self):
-        """Async context manager entry."""
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):  # noqa: ARG002
-        """Async context manager exit."""
-        self.close()
-        return False
-
-    async def _create_document_with_docling(
-        self,
-        docling_document,
-        uri: str | None = None,
-        title: str | None = None,
-        metadata: dict | None = None,
-        chunks: list[Chunk] | None = None,
-    ) -> Document:
-        """Create a new document from DoclingDocument."""
-        content = docling_document.export_to_markdown()
-        document = Document(
-            content=content,
-            uri=uri,
-            title=title,
-            metadata=metadata or {},
-        )
-        return await self.document_repository._create_with_docling(
-            document, docling_document, chunks
-        )
-
-    async def create_document(
-        self,
-        content: str,
-        uri: str | None = None,
-        title: str | None = None,
-        metadata: dict | None = None,
-        chunks: list[Chunk] | None = None,
-    ) -> Document:
-        """Create a new document with optional URI and metadata.
-
-        Args:
-            content: The text content of the document.
-            uri: Optional URI identifier for the document.
-            metadata: Optional metadata dictionary.
-            chunks: Optional list of pre-created chunks to use instead of generating new ones.
-
-        Returns:
-            The created Document instance.
-        """
-        # Convert content to DoclingDocument for processing
-        docling_document = text_to_docling_document(content)
-
-        document = Document(
-            content=content,
-            uri=uri,
-            title=title,
-            metadata=metadata or {},
-        )
-        return await self.document_repository._create_with_docling(
-            document, docling_document, chunks
-        )
-
-    async def create_document_from_source(
-        self, source: str | Path, title: str | None = None, metadata: dict | None = None
-    ) -> Document:
-        """Create or update a document from a file path or URL.
-
-        Checks if a document with the same URI already exists:
-        - If MD5 is unchanged, returns existing document
-        - If MD5 changed, updates the document
-        - If no document exists, creates a new one
-
-        Args:
-            source: File path (as string or Path) or URL to parse
-            metadata: Optional metadata dictionary
-
-        Returns:
-            Document instance (created, updated, or existing)
-
-        Raises:
-            ValueError: If the file/URL cannot be parsed or doesn't exist
-            httpx.RequestError: If URL request fails
-        """
-
-        # Normalize metadata
-        metadata = metadata or {}
-
-        # Check if it's a URL
-        source_str = str(source)
-        parsed_url = urlparse(source_str)
-        if parsed_url.scheme in ("http", "https"):
-            return await self._create_or_update_document_from_url(
-                source_str, title=title, metadata=metadata
-            )
-        elif parsed_url.scheme == "file":
-            # Handle file:// URI by converting to path
-            source_path = Path(parsed_url.path)
-        else:
-            # Handle as regular file path
-            source_path = Path(source) if isinstance(source, str) else source
-        if source_path.suffix.lower() not in FileReader.extensions:
-            raise ValueError(f"Unsupported file extension: {source_path.suffix}")
-
-        if not source_path.exists():
-            raise ValueError(f"File does not exist: {source_path}")
-
-        uri = source_path.absolute().as_uri()
-        md5_hash = hashlib.md5(source_path.read_bytes()).hexdigest()
-
-        # Get content type from file extension (do before early return)
-        content_type, _ = mimetypes.guess_type(str(source_path))
-        if not content_type:
-            content_type = "application/octet-stream"
-        # Merge metadata with contentType and md5
-        metadata.update({"contentType": content_type, "md5": md5_hash})
-
-        # Check if document already exists
-        existing_doc = await self.get_document_by_uri(uri)
-        if existing_doc and existing_doc.metadata.get("md5") == md5_hash:
-            # MD5 unchanged; update title/metadata if provided
-            updated = False
-            if title is not None and title != existing_doc.title:
-                existing_doc.title = title
-                updated = True
-            if metadata:
-                existing_doc.metadata = {**(existing_doc.metadata or {}), **metadata}
-                updated = True
-            if updated:
-                return await self.document_repository.update(existing_doc)
-            return existing_doc
-
-        # Parse file only when content changed or new document
-        docling_document = FileReader.parse_file(source_path)
-
-        if existing_doc:
-            # Update existing document
-            existing_doc.content = docling_document.export_to_markdown()
-            existing_doc.metadata = metadata
-            if title is not None:
-                existing_doc.title = title
-            return await self.document_repository._update_with_docling(
-                existing_doc, docling_document
-            )
-        else:
-            # Create new document using DoclingDocument
-            return await self._create_document_with_docling(
-                docling_document=docling_document,
-                uri=uri,
-                title=title,
-                metadata=metadata,
-            )
-
-    async def _create_or_update_document_from_url(
-        self, url: str, title: str | None = None, metadata: dict | None = None
-    ) -> Document:
-        """Create or update a document from a URL by downloading and parsing the content.
-
-        Checks if a document with the same URI already exists:
-        - If MD5 is unchanged, returns existing document
-        - If MD5 changed, updates the document
-        - If no document exists, creates a new one
-
-        Args:
-            url: URL to download and parse
-            metadata: Optional metadata dictionary
-
-        Returns:
-            Document instance (created, updated, or existing)
-
-        Raises:
-            ValueError: If the content cannot be parsed
-            httpx.RequestError: If URL request fails
-        """
-        metadata = metadata or {}
-
-        async with httpx.AsyncClient() as client:
-            response = await client.get(url)
-            response.raise_for_status()
-
-            md5_hash = hashlib.md5(response.content).hexdigest()
-
-            # Get content type early (used for potential no-op update)
-            content_type = response.headers.get("content-type", "").lower()
-
-            # Check if document already exists
-            existing_doc = await self.get_document_by_uri(url)
-            if existing_doc and existing_doc.metadata.get("md5") == md5_hash:
-                # MD5 unchanged; update title/metadata if provided
-                updated = False
-                if title is not None and title != existing_doc.title:
-                    existing_doc.title = title
-                    updated = True
-                metadata.update({"contentType": content_type, "md5": md5_hash})
-                if metadata:
-                    existing_doc.metadata = {
-                        **(existing_doc.metadata or {}),
-                        **metadata,
-                    }
-                    updated = True
-                if updated:
-                    return await self.document_repository.update(existing_doc)
-                return existing_doc
-            file_extension = self._get_extension_from_content_type_or_url(
-                url, content_type
-            )
-
-            if file_extension not in FileReader.extensions:
-                raise ValueError(
-                    f"Unsupported content type/extension: {content_type}/{file_extension}"
-                )
-
-            # Create a temporary file with the appropriate extension
-            with tempfile.NamedTemporaryFile(
-                mode="wb", suffix=file_extension
-            ) as temp_file:
-                temp_file.write(response.content)
-                temp_file.flush()  # Ensure content is written to disk
-                temp_path = Path(temp_file.name)
-
-                # Parse the content using FileReader
-                docling_document = FileReader.parse_file(temp_path)
-
-                # Merge metadata with contentType and md5
-                metadata.update({"contentType": content_type, "md5": md5_hash})
-
-                if existing_doc:
-                    existing_doc.content = docling_document.export_to_markdown()
-                    existing_doc.metadata = metadata
-                    if title is not None:
-                        existing_doc.title = title
-                    return await self.document_repository._update_with_docling(
-                        existing_doc, docling_document
-                    )
-                else:
-                    return await self._create_document_with_docling(
-                        docling_document=docling_document,
-                        uri=url,
-                        title=title,
-                        metadata=metadata,
-                    )
-
-    def _get_extension_from_content_type_or_url(
-        self, url: str, content_type: str
-    ) -> str:
-        """Determine file extension from content type or URL."""
-        # Common content type mappings
-        content_type_map = {
-            "text/html": ".html",
-            "text/plain": ".txt",
-            "text/markdown": ".md",
-            "application/pdf": ".pdf",
-            "application/json": ".json",
-            "text/csv": ".csv",
-            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
-            "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
-            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
-        }
-
-        # Try content type first
-        for ct, ext in content_type_map.items():
-            if ct in content_type:
-                return ext
-
-        # Try URL extension
-        parsed_url = urlparse(url)
-        path = Path(parsed_url.path)
-        if path.suffix:
-            return path.suffix.lower()
-
-        # Default to .html for web content
-        return ".html"
-
-    async def get_document_by_id(self, document_id: str) -> Document | None:
-        """Get a document by its ID.
-
-        Args:
-            document_id: The unique identifier of the document.
-
-        Returns:
-            The Document instance if found, None otherwise.
-        """
-        return await self.document_repository.get_by_id(document_id)
-
-    async def get_document_by_uri(self, uri: str) -> Document | None:
-        """Get a document by its URI.
-
-        Args:
-            uri: The URI identifier of the document.
-
-        Returns:
-            The Document instance if found, None otherwise.
-        """
-        return await self.document_repository.get_by_uri(uri)
-
-    async def update_document(self, document: Document) -> Document:
-        """Update an existing document."""
-        # Convert content to DoclingDocument
-        docling_document = text_to_docling_document(document.content)
-
-        return await self.document_repository._update_with_docling(
-            document, docling_document
-        )
-
-    async def delete_document(self, document_id: str) -> bool:
-        """Delete a document by its ID."""
-        return await self.document_repository.delete(document_id)
-
-    async def list_documents(
-        self, limit: int | None = None, offset: int | None = None
-    ) -> list[Document]:
-        """List all documents with optional pagination.
-
-        Args:
-            limit: Maximum number of documents to return.
-            offset: Number of documents to skip.
-
-        Returns:
-            List of Document instances.
-        """
-        return await self.document_repository.list_all(limit=limit, offset=offset)
-
-    async def search(
-        self, query: str, limit: int = 5, search_type: str = "hybrid"
-    ) -> list[tuple[Chunk, float]]:
-        """Search for relevant chunks using the specified search method with optional reranking.
-
-        Args:
-            query: The search query string.
-            limit: Maximum number of results to return.
-            search_type: Type of search - "vector", "fts", or "hybrid" (default).
-
-        Returns:
-            List of (chunk, score) tuples ordered by relevance.
-        """
-        # Get reranker if available
-        reranker = get_reranker()
-
-        if reranker is None:
-            # No reranking - return direct search results
-            return await self.chunk_repository.search(query, limit, search_type)
-
-        # Get more initial results (3X) for reranking
-        search_limit = limit * 3
-        search_results = await self.chunk_repository.search(
-            query, search_limit, search_type
-        )
-
-        # Apply reranking
-        chunks = [chunk for chunk, _ in search_results]
-        reranked_results = await reranker.rerank(query, chunks, top_n=limit)
-
-        # Return reranked results with scores from reranker
-        return reranked_results
-
-    async def expand_context(
-        self,
-        search_results: list[tuple[Chunk, float]],
-        radius: int = Config.CONTEXT_CHUNK_RADIUS,
-    ) -> list[tuple[Chunk, float]]:
-        """Expand search results with adjacent chunks, merging overlapping chunks.
-
-        Args:
-            search_results: List of (chunk, score) tuples from search.
-            radius: Number of adjacent chunks to include before/after each chunk.
-                Defaults to CONTEXT_CHUNK_RADIUS config setting.
-
-        Returns:
-            List of (chunk, score) tuples with expanded and merged context chunks.
-        """
-        if radius == 0:
-            return search_results
-
-        # Group chunks by document_id to handle merging within documents
-        document_groups = {}
-        for chunk, score in search_results:
-            doc_id = chunk.document_id
-            if doc_id not in document_groups:
-                document_groups[doc_id] = []
-            document_groups[doc_id].append((chunk, score))
-
-        results = []
-
-        for doc_id, doc_chunks in document_groups.items():
-            # Get all expanded ranges for this document
-            expanded_ranges = []
-            for chunk, score in doc_chunks:
-                adjacent_chunks = await self.chunk_repository.get_adjacent_chunks(
-                    chunk, radius
-                )
-
-                all_chunks = adjacent_chunks + [chunk]
-
-                # Get the range of orders for this expanded chunk
-                orders = [c.order for c in all_chunks]
-                min_order = min(orders)
-                max_order = max(orders)
-
-                expanded_ranges.append(
-                    {
-                        "original_chunk": chunk,
-                        "score": score,
-                        "min_order": min_order,
-                        "max_order": max_order,
-                        "all_chunks": sorted(all_chunks, key=lambda c: c.order),
-                    }
-                )
-
-            # Merge overlapping/adjacent ranges
-            merged_ranges = self._merge_overlapping_ranges(expanded_ranges)
-
-            # Create merged chunks
-            for merged_range in merged_ranges:
-                combined_content_parts = [c.content for c in merged_range["all_chunks"]]
-
-                # Use the first original chunk for metadata
-                original_chunk = merged_range["original_chunks"][0]
-
-                merged_chunk = Chunk(
-                    id=original_chunk.id,
-                    document_id=original_chunk.document_id,
-                    content="".join(combined_content_parts),
-                    metadata=original_chunk.metadata,
-                    document_uri=original_chunk.document_uri,
-                    document_title=original_chunk.document_title,
-                    document_meta=original_chunk.document_meta,
-                )
-
-                # Use the highest score from merged chunks
-                best_score = max(merged_range["scores"])
-                results.append((merged_chunk, best_score))
-
-        return results
-
-    def _merge_overlapping_ranges(self, expanded_ranges):
-        """Merge overlapping or adjacent expanded ranges."""
-        if not expanded_ranges:
-            return []
-
-        # Sort by min_order
-        sorted_ranges = sorted(expanded_ranges, key=lambda x: x["min_order"])
-        merged = []
-
-        current = {
-            "min_order": sorted_ranges[0]["min_order"],
-            "max_order": sorted_ranges[0]["max_order"],
-            "original_chunks": [sorted_ranges[0]["original_chunk"]],
-            "scores": [sorted_ranges[0]["score"]],
-            "all_chunks": sorted_ranges[0]["all_chunks"],
-        }
-
-        for range_info in sorted_ranges[1:]:
-            # Check if ranges overlap or are adjacent (max_order + 1 >= min_order)
-            if current["max_order"] >= range_info["min_order"] - 1:
-                # Merge ranges
-                current["max_order"] = max(
-                    current["max_order"], range_info["max_order"]
-                )
-                current["original_chunks"].append(range_info["original_chunk"])
-                current["scores"].append(range_info["score"])
-
-                # Merge all_chunks and deduplicate by order
-                all_chunks_dict = {}
-                for chunk in current["all_chunks"] + range_info["all_chunks"]:
-                    order = chunk.order
-                    all_chunks_dict[order] = chunk
-                current["all_chunks"] = [
-                    all_chunks_dict[order] for order in sorted(all_chunks_dict.keys())
-                ]
-            else:
-                # No overlap, add current to merged and start new
-                merged.append(current)
-                current = {
-                    "min_order": range_info["min_order"],
-                    "max_order": range_info["max_order"],
-                    "original_chunks": [range_info["original_chunk"]],
-                    "scores": [range_info["score"]],
-                    "all_chunks": range_info["all_chunks"],
-                }
-
-        # Add the last range
-        merged.append(current)
-        return merged
-
-    async def ask(self, question: str, cite: bool = False) -> str:
-        """Ask a question using the configured QA agent.
-
-        Args:
-            question: The question to ask.
-            cite: Whether to include citations in the response.
-
-        Returns:
-            The generated answer as a string.
-        """
-        from haiku.rag.qa import get_qa_agent
-
-        qa_agent = get_qa_agent(self, use_citations=cite)
-        return await qa_agent.answer(question)
-
-    async def rebuild_database(self) -> AsyncGenerator[str, None]:
-        """Rebuild the database by deleting all chunks and re-indexing all documents.
-
-        For documents with URIs:
-        - Deletes the document and re-adds it from source if source exists
-        - Skips documents where source no longer exists
-
-        For documents without URIs:
-        - Re-creates chunks from existing content
-
-        Yields:
-            int: The ID of the document currently being processed
-        """
-        await self.chunk_repository.delete_all()
-        self.store.recreate_embeddings_table()
-
-        # Update settings to current config
-        settings_repo = SettingsRepository(self.store)
-        settings_repo.save_current_settings()
-
-        documents = await self.list_documents()
-
-        for doc in documents:
-            assert doc.id is not None, "Document ID should not be None"
-            if doc.uri:
-                # Document has a URI - delete and try to re-add from source
-                try:
-                    # Delete the old document first
-                    await self.delete_document(doc.id)
-
-                    # Try to re-create from source (this creates the document with chunks)
-                    new_doc = await self.create_document_from_source(
-                        source=doc.uri, metadata=doc.metadata or {}
-                    )
-
-                    assert new_doc.id is not None, "New document ID should not be None"
-                    yield new_doc.id
-
-                except (FileNotFoundError, ValueError, OSError) as e:
-                    # Source doesn't exist or can't be accessed - document already deleted, skip
-                    print(f"Skipping document with URI {doc.uri}: {e}")
-                    continue
-                except Exception as e:
-                    # Unexpected error - log it and skip
-                    print(
-                        f"Unexpected error processing document with URI {doc.uri}: {e}"
-                    )
-                    continue
-            else:
-                # Document without URI - re-create chunks from existing content
-                docling_document = text_to_docling_document(doc.content)
-                await self.chunk_repository.create_chunks_for_document(
-                    doc.id, docling_document
-                )
-                yield doc.id
-
-        # Final maintenance: centralized vacuum to curb disk usage
-        try:
-            self.store.vacuum()
-        except Exception:
-            pass
-
-    async def vacuum(self) -> None:
-        """Optimize and clean up old versions across all tables."""
-        self.store.vacuum()
-
-    def close(self):
-        """Close the underlying store connection."""
-        self.store.close()
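For orientation, a minimal usage sketch of the removed HaikuRAG client follows, based only on the deleted code above. The database path, file name, and query strings are illustrative, and the import only works on versions that still ship haiku/rag/client.py (0.10.2 or earlier); per this diff the module is not present in the 0.14.0 wheel.

import asyncio
from pathlib import Path

from haiku.rag.client import HaikuRAG  # present in haiku.rag <= 0.10.2; deleted in the 0.14.0 wheel


async def main() -> None:
    # The client is an async context manager; the underlying store is closed on exit.
    async with HaikuRAG(db_path=Path("haiku.rag.lancedb")) as client:
        # Index a local file or URL; re-adding an unchanged source is a no-op (MD5 check).
        await client.create_document_from_source("docs/notes.md")

        # Hybrid (vector + full-text) search, reranked when a reranker is configured.
        for chunk, score in await client.search("how are documents chunked?", limit=5):
            print(f"{score:.3f}  {chunk.document_uri}")

        # Answer a question with the configured QA agent, including citations.
        print(await client.ask("How are documents chunked?", cite=True))


asyncio.run(main())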
haiku/rag/config.py
DELETED
@@ -1,81 +0,0 @@
-import os
-from pathlib import Path
-
-from dotenv import load_dotenv
-from pydantic import BaseModel, field_validator
-
-from haiku.rag.utils import get_default_data_dir
-
-load_dotenv()
-
-
-class AppConfig(BaseModel):
-    ENV: str = "production"
-
-    LANCEDB_API_KEY: str = ""
-    LANCEDB_URI: str = ""
-    LANCEDB_REGION: str = ""
-
-    DEFAULT_DATA_DIR: Path = get_default_data_dir()
-    MONITOR_DIRECTORIES: list[Path] = []
-
-    EMBEDDINGS_PROVIDER: str = "ollama"
-    EMBEDDINGS_MODEL: str = "mxbai-embed-large"
-    EMBEDDINGS_VECTOR_DIM: int = 1024
-
-    RERANK_PROVIDER: str = ""
-    RERANK_MODEL: str = ""
-
-    QA_PROVIDER: str = "ollama"
-    QA_MODEL: str = "gpt-oss"
-
-    # Research defaults (fallback to QA if not provided via env)
-    RESEARCH_PROVIDER: str = "ollama"
-    RESEARCH_MODEL: str = "gpt-oss"
-
-    CHUNK_SIZE: int = 256
-    CONTEXT_CHUNK_RADIUS: int = 0
-
-    # Optional dotted path or file path to a callable that preprocesses
-    # markdown content before chunking. Examples:
-    MARKDOWN_PREPROCESSOR: str = ""
-
-    OLLAMA_BASE_URL: str = "http://localhost:11434"
-
-    VLLM_EMBEDDINGS_BASE_URL: str = ""
-    VLLM_RERANK_BASE_URL: str = ""
-    VLLM_QA_BASE_URL: str = ""
-    VLLM_RESEARCH_BASE_URL: str = ""
-
-    # Provider keys
-    VOYAGE_API_KEY: str = ""
-    OPENAI_API_KEY: str = ""
-    ANTHROPIC_API_KEY: str = ""
-    COHERE_API_KEY: str = ""
-
-    # If true, refuse to auto-create a new LanceDB database or tables
-    # and error out when the database does not already exist.
-    DISABLE_DB_AUTOCREATE: bool = False
-
-    @field_validator("MONITOR_DIRECTORIES", mode="before")
-    @classmethod
-    def parse_monitor_directories(cls, v):
-        if isinstance(v, str):
-            if not v.strip():
-                return []
-            return [
-                Path(path.strip()).absolute() for path in v.split(",") if path.strip()
-            ]
-        return v
-
-
-# Expose Config object for app to import
-Config = AppConfig.model_validate(os.environ)
-if Config.OPENAI_API_KEY:
-    os.environ["OPENAI_API_KEY"] = Config.OPENAI_API_KEY
-if Config.VOYAGE_API_KEY:
-    os.environ["VOYAGE_API_KEY"] = Config.VOYAGE_API_KEY
-if Config.ANTHROPIC_API_KEY:
-    os.environ["ANTHROPIC_API_KEY"] = Config.ANTHROPIC_API_KEY
-if Config.COHERE_API_KEY:
-    os.environ["CO_API_KEY"] = Config.COHERE_API_KEY