haiku_rag_slim-0.16.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.

Potentially problematic release: this version of haiku.rag-slim might be problematic.

Files changed (71)
  1. haiku/rag/__init__.py +0 -0
  2. haiku/rag/app.py +542 -0
  3. haiku/rag/chunker.py +65 -0
  4. haiku/rag/cli.py +466 -0
  5. haiku/rag/client.py +731 -0
  6. haiku/rag/config/__init__.py +74 -0
  7. haiku/rag/config/loader.py +94 -0
  8. haiku/rag/config/models.py +99 -0
  9. haiku/rag/embeddings/__init__.py +49 -0
  10. haiku/rag/embeddings/base.py +25 -0
  11. haiku/rag/embeddings/ollama.py +28 -0
  12. haiku/rag/embeddings/openai.py +26 -0
  13. haiku/rag/embeddings/vllm.py +29 -0
  14. haiku/rag/embeddings/voyageai.py +27 -0
  15. haiku/rag/graph/__init__.py +26 -0
  16. haiku/rag/graph/agui/__init__.py +53 -0
  17. haiku/rag/graph/agui/cli_renderer.py +135 -0
  18. haiku/rag/graph/agui/emitter.py +197 -0
  19. haiku/rag/graph/agui/events.py +254 -0
  20. haiku/rag/graph/agui/server.py +310 -0
  21. haiku/rag/graph/agui/state.py +34 -0
  22. haiku/rag/graph/agui/stream.py +86 -0
  23. haiku/rag/graph/common/__init__.py +5 -0
  24. haiku/rag/graph/common/models.py +42 -0
  25. haiku/rag/graph/common/nodes.py +265 -0
  26. haiku/rag/graph/common/prompts.py +46 -0
  27. haiku/rag/graph/common/utils.py +44 -0
  28. haiku/rag/graph/deep_qa/__init__.py +1 -0
  29. haiku/rag/graph/deep_qa/dependencies.py +27 -0
  30. haiku/rag/graph/deep_qa/graph.py +243 -0
  31. haiku/rag/graph/deep_qa/models.py +20 -0
  32. haiku/rag/graph/deep_qa/prompts.py +59 -0
  33. haiku/rag/graph/deep_qa/state.py +56 -0
  34. haiku/rag/graph/research/__init__.py +3 -0
  35. haiku/rag/graph/research/common.py +87 -0
  36. haiku/rag/graph/research/dependencies.py +151 -0
  37. haiku/rag/graph/research/graph.py +295 -0
  38. haiku/rag/graph/research/models.py +166 -0
  39. haiku/rag/graph/research/prompts.py +107 -0
  40. haiku/rag/graph/research/state.py +85 -0
  41. haiku/rag/logging.py +56 -0
  42. haiku/rag/mcp.py +245 -0
  43. haiku/rag/monitor.py +194 -0
  44. haiku/rag/qa/__init__.py +33 -0
  45. haiku/rag/qa/agent.py +93 -0
  46. haiku/rag/qa/prompts.py +60 -0
  47. haiku/rag/reader.py +135 -0
  48. haiku/rag/reranking/__init__.py +63 -0
  49. haiku/rag/reranking/base.py +13 -0
  50. haiku/rag/reranking/cohere.py +34 -0
  51. haiku/rag/reranking/mxbai.py +28 -0
  52. haiku/rag/reranking/vllm.py +44 -0
  53. haiku/rag/reranking/zeroentropy.py +59 -0
  54. haiku/rag/store/__init__.py +4 -0
  55. haiku/rag/store/engine.py +309 -0
  56. haiku/rag/store/models/__init__.py +4 -0
  57. haiku/rag/store/models/chunk.py +17 -0
  58. haiku/rag/store/models/document.py +17 -0
  59. haiku/rag/store/repositories/__init__.py +9 -0
  60. haiku/rag/store/repositories/chunk.py +442 -0
  61. haiku/rag/store/repositories/document.py +261 -0
  62. haiku/rag/store/repositories/settings.py +165 -0
  63. haiku/rag/store/upgrades/__init__.py +62 -0
  64. haiku/rag/store/upgrades/v0_10_1.py +64 -0
  65. haiku/rag/store/upgrades/v0_9_3.py +112 -0
  66. haiku/rag/utils.py +211 -0
  67. haiku_rag_slim-0.16.0.dist-info/METADATA +128 -0
  68. haiku_rag_slim-0.16.0.dist-info/RECORD +71 -0
  69. haiku_rag_slim-0.16.0.dist-info/WHEEL +4 -0
  70. haiku_rag_slim-0.16.0.dist-info/entry_points.txt +2 -0
  71. haiku_rag_slim-0.16.0.dist-info/licenses/LICENSE +7 -0
haiku/rag/client.py ADDED
@@ -0,0 +1,731 @@
import hashlib
import logging
import mimetypes
import tempfile
from collections.abc import AsyncGenerator
from pathlib import Path
from urllib.parse import urlparse

import httpx

from haiku.rag.config import AppConfig, Config
from haiku.rag.reranking import get_reranker
from haiku.rag.store.engine import Store
from haiku.rag.store.models.chunk import Chunk
from haiku.rag.store.models.document import Document
from haiku.rag.store.repositories.chunk import ChunkRepository
from haiku.rag.store.repositories.document import DocumentRepository
from haiku.rag.store.repositories.settings import SettingsRepository

logger = logging.getLogger(__name__)


class HaikuRAG:
    """High-level haiku-rag client."""

    def __init__(
        self,
        db_path: Path | None = None,
        config: AppConfig = Config,
        skip_validation: bool = False,
        allow_create: bool = True,
    ):
        """Initialize the RAG client with a database path.

        Args:
            db_path: Path to the database file. If None, uses config.storage.data_dir.
            config: Configuration to use. Defaults to the global Config.
            skip_validation: Whether to skip configuration validation on database load.
            allow_create: Whether to allow database creation. If False, raises an error
                if the database doesn't exist (for read operations).
        """
        self._config = config
        if db_path is None:
            db_path = self._config.storage.data_dir / "haiku.rag.lancedb"
        self.store = Store(
            db_path,
            config=self._config,
            skip_validation=skip_validation,
            allow_create=allow_create,
        )
        self.document_repository = DocumentRepository(self.store)
        self.chunk_repository = ChunkRepository(self.store)

    async def __aenter__(self):
        """Async context manager entry."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):  # noqa: ARG002
        """Async context manager exit."""
        # Wait for any pending vacuum to complete before closing
        async with self.store._vacuum_lock:
            pass
        self.close()
        return False

    async def _create_document_with_docling(
        self,
        docling_document,
        uri: str | None = None,
        title: str | None = None,
        metadata: dict | None = None,
        chunks: list[Chunk] | None = None,
    ) -> Document:
        """Create a new document from DoclingDocument."""
        content = docling_document.export_to_markdown()
        document = Document(
            content=content,
            uri=uri,
            title=title,
            metadata=metadata or {},
        )
        return await self.document_repository._create_and_chunk(
            document, docling_document, chunks
        )

    async def create_document(
        self,
        content: str,
        uri: str | None = None,
        title: str | None = None,
        metadata: dict | None = None,
        chunks: list[Chunk] | None = None,
    ) -> Document:
        """Create a new document with optional URI and metadata.

        Args:
            content: The text content of the document.
            uri: Optional URI identifier for the document.
            title: Optional title for the document.
            metadata: Optional metadata dictionary.
            chunks: Optional list of pre-created chunks to use instead of generating new ones.

        Returns:
            The created Document instance.
        """
        document = Document(
            content=content,
            uri=uri,
            title=title,
            metadata=metadata or {},
        )

        # Only create docling_document if we need to generate chunks
        if chunks is None:
            # Lazy import to avoid loading docling
            from haiku.rag.utils import text_to_docling_document

            docling_document = text_to_docling_document(content)
        else:
            # Chunks already provided, no conversion needed
            docling_document = None

        return await self.document_repository._create_and_chunk(
            document, docling_document, chunks
        )

    async def create_document_from_source(
        self, source: str | Path, title: str | None = None, metadata: dict | None = None
    ) -> Document | list[Document]:
        """Create or update document(s) from a file path, directory, or URL.

        Checks if a document with the same URI already exists:
        - If MD5 is unchanged, returns existing document
        - If MD5 changed, updates the document
        - If no document exists, creates a new one

        Args:
            source: File path, directory (as string or Path), or URL to parse
            title: Optional title (only used for single files, not directories)
            metadata: Optional metadata dictionary

        Returns:
            Document instance (created, updated, or existing) for single files/URLs
            List of Document instances for directories

        Raises:
            ValueError: If the file/URL cannot be parsed or doesn't exist
            httpx.RequestError: If URL request fails
        """
        # Normalize metadata
        metadata = metadata or {}

        # Check if it's a URL
        source_str = str(source)
        parsed_url = urlparse(source_str)
        if parsed_url.scheme in ("http", "https"):
            return await self._create_or_update_document_from_url(
                source_str, title=title, metadata=metadata
            )
        elif parsed_url.scheme == "file":
            # Handle file:// URI by converting to path
            source_path = Path(parsed_url.path)
        else:
            # Handle as regular file path
            source_path = Path(source) if isinstance(source, str) else source

        # Handle directories
        if source_path.is_dir():
            from haiku.rag.monitor import FileFilter

            documents = []
            filter = FileFilter(
                ignore_patterns=self._config.monitor.ignore_patterns or None,
                include_patterns=self._config.monitor.include_patterns or None,
            )
            for path in source_path.rglob("*"):
                if path.is_file() and filter.include_file(str(path)):
                    doc = await self._create_document_from_file(
                        path, title=None, metadata=metadata
                    )
                    documents.append(doc)
            return documents

        # Handle single file
        return await self._create_document_from_file(
            source_path, title=title, metadata=metadata
        )

    async def _create_document_from_file(
        self, source_path: Path, title: str | None = None, metadata: dict | None = None
    ) -> Document:
        """Create or update a document from a single file path.

        Args:
            source_path: Path to the file
            title: Optional title
            metadata: Optional metadata dictionary

        Returns:
            Document instance (created, updated, or existing)

        Raises:
            ValueError: If the file cannot be parsed or doesn't exist
        """
        # Lazy import to avoid loading docling
        from haiku.rag.reader import FileReader

        metadata = metadata or {}

        if source_path.suffix.lower() not in FileReader.extensions:
            raise ValueError(f"Unsupported file extension: {source_path.suffix}")

        if not source_path.exists():
            raise ValueError(f"File does not exist: {source_path}")

        uri = source_path.absolute().as_uri()
        md5_hash = hashlib.md5(source_path.read_bytes()).hexdigest()

        # Get content type from file extension (do before early return)
        content_type, _ = mimetypes.guess_type(str(source_path))
        if not content_type:
            content_type = "application/octet-stream"
        # Merge metadata with contentType and md5
        metadata.update({"contentType": content_type, "md5": md5_hash})

        # Check if document already exists
        existing_doc = await self.get_document_by_uri(uri)
        if existing_doc and existing_doc.metadata.get("md5") == md5_hash:
            # MD5 unchanged; update title/metadata if provided
            updated = False
            if title is not None and title != existing_doc.title:
                existing_doc.title = title
                updated = True

            # Check if metadata actually changed (beyond contentType and md5)
            merged_metadata = {**(existing_doc.metadata or {}), **metadata}
            if merged_metadata != existing_doc.metadata:
                existing_doc.metadata = merged_metadata
                updated = True

            if updated:
                return await self.document_repository.update(existing_doc)
            return existing_doc

        # Parse file only when content changed or new document
        docling_document = FileReader.parse_file(source_path)

        if existing_doc:
            # Update existing document
            existing_doc.content = docling_document.export_to_markdown()
            existing_doc.metadata = metadata
            if title is not None:
                existing_doc.title = title
            return await self.document_repository._update_and_rechunk(
                existing_doc, docling_document
            )
        else:
            # Create new document using DoclingDocument
            return await self._create_document_with_docling(
                docling_document=docling_document,
                uri=uri,
                title=title,
                metadata=metadata,
            )

    async def _create_or_update_document_from_url(
        self, url: str, title: str | None = None, metadata: dict | None = None
    ) -> Document:
        """Create or update a document from a URL by downloading and parsing the content.

        Checks if a document with the same URI already exists:
        - If MD5 is unchanged, returns existing document
        - If MD5 changed, updates the document
        - If no document exists, creates a new one

        Args:
            url: URL to download and parse
            title: Optional title
            metadata: Optional metadata dictionary

        Returns:
            Document instance (created, updated, or existing)

        Raises:
            ValueError: If the content cannot be parsed
            httpx.RequestError: If URL request fails
        """
        # Lazy import to avoid loading docling
        from haiku.rag.reader import FileReader

        metadata = metadata or {}

        async with httpx.AsyncClient() as client:
            response = await client.get(url)
            response.raise_for_status()

            md5_hash = hashlib.md5(response.content).hexdigest()

            # Get content type early (used for potential no-op update)
            content_type = response.headers.get("content-type", "").lower()

            # Check if document already exists
            existing_doc = await self.get_document_by_uri(url)
            if existing_doc and existing_doc.metadata.get("md5") == md5_hash:
                # MD5 unchanged; update title/metadata if provided
                updated = False
                if title is not None and title != existing_doc.title:
                    existing_doc.title = title
                    updated = True

                metadata.update({"contentType": content_type, "md5": md5_hash})
                # Check if metadata actually changed (beyond contentType and md5)
                merged_metadata = {**(existing_doc.metadata or {}), **metadata}
                if merged_metadata != existing_doc.metadata:
                    existing_doc.metadata = merged_metadata
                    updated = True

                if updated:
                    return await self.document_repository.update(existing_doc)
                return existing_doc
            file_extension = self._get_extension_from_content_type_or_url(
                url, content_type
            )

            if file_extension not in FileReader.extensions:
                raise ValueError(
                    f"Unsupported content type/extension: {content_type}/{file_extension}"
                )

            # Create a temporary file with the appropriate extension
            with tempfile.NamedTemporaryFile(
                mode="wb", suffix=file_extension
            ) as temp_file:
                temp_file.write(response.content)
                temp_file.flush()  # Ensure content is written to disk
                temp_path = Path(temp_file.name)

                # Parse the content using FileReader
                docling_document = FileReader.parse_file(temp_path)

                # Merge metadata with contentType and md5
                metadata.update({"contentType": content_type, "md5": md5_hash})

                if existing_doc:
                    existing_doc.content = docling_document.export_to_markdown()
                    existing_doc.metadata = metadata
                    if title is not None:
                        existing_doc.title = title
                    return await self.document_repository._update_and_rechunk(
                        existing_doc, docling_document
                    )
                else:
                    return await self._create_document_with_docling(
                        docling_document=docling_document,
                        uri=url,
                        title=title,
                        metadata=metadata,
                    )

    def _get_extension_from_content_type_or_url(
        self, url: str, content_type: str
    ) -> str:
        """Determine file extension from content type or URL."""
        # Common content type mappings
        content_type_map = {
            "text/html": ".html",
            "text/plain": ".txt",
            "text/markdown": ".md",
            "application/pdf": ".pdf",
            "application/json": ".json",
            "text/csv": ".csv",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
        }

        # Try content type first
        for ct, ext in content_type_map.items():
            if ct in content_type:
                return ext

        # Try URL extension
        parsed_url = urlparse(url)
        path = Path(parsed_url.path)
        if path.suffix:
            return path.suffix.lower()

        # Default to .html for web content
        return ".html"

    async def get_document_by_id(self, document_id: str) -> Document | None:
        """Get a document by its ID.

        Args:
            document_id: The unique identifier of the document.

        Returns:
            The Document instance if found, None otherwise.
        """
        return await self.document_repository.get_by_id(document_id)

    async def get_document_by_uri(self, uri: str) -> Document | None:
        """Get a document by its URI.

        Args:
            uri: The URI identifier of the document.

        Returns:
            The Document instance if found, None otherwise.
        """
        return await self.document_repository.get_by_uri(uri)

    async def update_document(self, document: Document) -> Document:
        """Update an existing document."""
        # Lazy import to avoid loading docling
        from haiku.rag.utils import text_to_docling_document

        # Convert content to DoclingDocument
        docling_document = text_to_docling_document(document.content)

        return await self.document_repository._update_and_rechunk(
            document, docling_document
        )

    async def delete_document(self, document_id: str) -> bool:
        """Delete a document by its ID."""
        return await self.document_repository.delete(document_id)

    async def list_documents(
        self,
        limit: int | None = None,
        offset: int | None = None,
        filter: str | None = None,
    ) -> list[Document]:
        """List all documents with optional pagination and filtering.

        Args:
            limit: Maximum number of documents to return.
            offset: Number of documents to skip.
            filter: Optional SQL WHERE clause to filter documents.

        Returns:
            List of Document instances matching the criteria.
        """
        return await self.document_repository.list_all(
            limit=limit, offset=offset, filter=filter
        )

    async def search(
        self,
        query: str,
        limit: int = 5,
        search_type: str = "hybrid",
        filter: str | None = None,
    ) -> list[tuple[Chunk, float]]:
        """Search for relevant chunks using the specified search method with optional reranking.

        Args:
            query: The search query string.
            limit: Maximum number of results to return.
            search_type: Type of search - "vector", "fts", or "hybrid" (default).
            filter: Optional SQL WHERE clause to filter documents before searching chunks.

        Returns:
            List of (chunk, score) tuples ordered by relevance.
        """
        # Get reranker if available
        reranker = get_reranker(config=self._config)

        if reranker is None:
            # No reranking - return direct search results
            return await self.chunk_repository.search(query, limit, search_type, filter)

        # Get more initial results (3x) for reranking
        search_limit = limit * 3
        search_results = await self.chunk_repository.search(
            query, search_limit, search_type, filter
        )

        # Apply reranking
        chunks = [chunk for chunk, _ in search_results]
        reranked_results = await reranker.rerank(query, chunks, top_n=limit)

        # Return reranked results with scores from reranker
        return reranked_results

    async def expand_context(
        self,
        search_results: list[tuple[Chunk, float]],
        radius: int | None = None,
    ) -> list[tuple[Chunk, float]]:
        """Expand search results with adjacent chunks, merging overlapping chunks.

        Args:
            search_results: List of (chunk, score) tuples from search.
            radius: Number of adjacent chunks to include before/after each chunk.
                If None, uses config.processing.context_chunk_radius.

        Returns:
            List of (chunk, score) tuples with expanded and merged context chunks.
        """
        if radius is None:
            radius = self._config.processing.context_chunk_radius
        if radius == 0:
            return search_results

        # Group chunks by document_id to handle merging within documents
        document_groups = {}
        for chunk, score in search_results:
            doc_id = chunk.document_id
            if doc_id not in document_groups:
                document_groups[doc_id] = []
            document_groups[doc_id].append((chunk, score))

        results = []

        for doc_id, doc_chunks in document_groups.items():
            # Get all expanded ranges for this document
            expanded_ranges = []
            for chunk, score in doc_chunks:
                adjacent_chunks = await self.chunk_repository.get_adjacent_chunks(
                    chunk, radius
                )

                all_chunks = adjacent_chunks + [chunk]

                # Get the range of orders for this expanded chunk
                orders = [c.order for c in all_chunks]
                min_order = min(orders)
                max_order = max(orders)

                expanded_ranges.append(
                    {
                        "original_chunk": chunk,
                        "score": score,
                        "min_order": min_order,
                        "max_order": max_order,
                        "all_chunks": sorted(all_chunks, key=lambda c: c.order),
                    }
                )

            # Merge overlapping/adjacent ranges
            merged_ranges = self._merge_overlapping_ranges(expanded_ranges)

            # Create merged chunks
            for merged_range in merged_ranges:
                combined_content_parts = [c.content for c in merged_range["all_chunks"]]

                # Use the first original chunk for metadata
                original_chunk = merged_range["original_chunks"][0]

                merged_chunk = Chunk(
                    id=original_chunk.id,
                    document_id=original_chunk.document_id,
                    content="".join(combined_content_parts),
                    metadata=original_chunk.metadata,
                    document_uri=original_chunk.document_uri,
                    document_title=original_chunk.document_title,
                    document_meta=original_chunk.document_meta,
                )

                # Use the highest score from merged chunks
                best_score = max(merged_range["scores"])
                results.append((merged_chunk, best_score))

        return results

    def _merge_overlapping_ranges(self, expanded_ranges):
        """Merge overlapping or adjacent expanded ranges."""
        if not expanded_ranges:
            return []

        # Sort by min_order
        sorted_ranges = sorted(expanded_ranges, key=lambda x: x["min_order"])
        merged = []

        current = {
            "min_order": sorted_ranges[0]["min_order"],
            "max_order": sorted_ranges[0]["max_order"],
            "original_chunks": [sorted_ranges[0]["original_chunk"]],
            "scores": [sorted_ranges[0]["score"]],
            "all_chunks": sorted_ranges[0]["all_chunks"],
        }

        for range_info in sorted_ranges[1:]:
            # Check if ranges overlap or are adjacent (max_order + 1 >= min_order)
            if current["max_order"] >= range_info["min_order"] - 1:
                # Merge ranges
                current["max_order"] = max(
                    current["max_order"], range_info["max_order"]
                )
                current["original_chunks"].append(range_info["original_chunk"])
                current["scores"].append(range_info["score"])

                # Merge all_chunks and deduplicate by order
                all_chunks_dict = {}
                for chunk in current["all_chunks"] + range_info["all_chunks"]:
                    order = chunk.order
                    all_chunks_dict[order] = chunk
                current["all_chunks"] = [
                    all_chunks_dict[order] for order in sorted(all_chunks_dict.keys())
                ]
            else:
                # No overlap, add current to merged and start new
                merged.append(current)
                current = {
                    "min_order": range_info["min_order"],
                    "max_order": range_info["max_order"],
                    "original_chunks": [range_info["original_chunk"]],
                    "scores": [range_info["score"]],
                    "all_chunks": range_info["all_chunks"],
                }

        # Add the last range
        merged.append(current)
        return merged

    async def ask(
        self, question: str, cite: bool = False, system_prompt: str | None = None
    ) -> str:
        """Ask a question using the configured QA agent.

        Args:
            question: The question to ask.
            cite: Whether to include citations in the response.
            system_prompt: Optional custom system prompt for the QA agent.

        Returns:
            The generated answer as a string.
        """
        from haiku.rag.qa import get_qa_agent

        qa_agent = get_qa_agent(
            self, config=self._config, use_citations=cite, system_prompt=system_prompt
        )
        return await qa_agent.answer(question)

    async def rebuild_database(self) -> AsyncGenerator[str, None]:
        """Rebuild the database by deleting all chunks and re-indexing all documents.

        For documents with URIs:
        - Re-adds from source if source exists
        - Re-embeds from existing content if source is missing

        For documents without URIs:
        - Re-creates chunks from existing content

        Yields:
            str: The ID of the document currently being processed
        """
        # Lazy import to avoid loading docling
        from haiku.rag.utils import text_to_docling_document

        await self.chunk_repository.delete_all()
        self.store.recreate_embeddings_table()

        # Update settings to current config
        settings_repo = SettingsRepository(self.store)
        settings_repo.save_current_settings()

        documents = await self.list_documents()

        for doc in documents:
            assert doc.id is not None, "Document ID should not be None"
            if doc.uri:
                # Document has a URI - check if source is accessible
                source_accessible = False
                parsed_url = urlparse(doc.uri)

                try:
                    if parsed_url.scheme == "file":
                        # Check if file exists
                        source_path = Path(parsed_url.path)
                        source_accessible = source_path.exists()
                    elif parsed_url.scheme in ("http", "https"):
                        # For URLs, we'll try to recreate and catch errors
                        source_accessible = True
                    else:
                        source_accessible = False
                except Exception:
                    source_accessible = False

                if source_accessible:
                    # Source exists - delete and recreate from source
                    try:
                        await self.delete_document(doc.id)
                        new_doc = await self.create_document_from_source(
                            source=doc.uri, metadata=doc.metadata or {}
                        )
                        # URIs always point to single files/URLs, never directories
                        assert isinstance(new_doc, Document)
                        assert new_doc.id is not None, (
                            "New document ID should not be None"
                        )
                        yield new_doc.id
                    except Exception as e:
                        logger.error(
                            "Error recreating document from source %s: %s",
                            doc.uri,
                            e,
                        )
                        continue
                else:
                    # Source missing - re-embed from existing content
                    logger.warning(
                        "Source missing for %s, re-embedding from content", doc.uri
                    )
                    docling_document = text_to_docling_document(doc.content)
                    await self.chunk_repository.create_chunks_for_document(
                        doc.id, docling_document
                    )
                    yield doc.id
            else:
                # Document without URI - re-create chunks from existing content
                docling_document = text_to_docling_document(doc.content)
                await self.chunk_repository.create_chunks_for_document(
                    doc.id, docling_document
                )
                yield doc.id

        # Final maintenance: centralized vacuum to curb disk usage
        try:
            await self.store.vacuum()
        except Exception:
            pass

    async def vacuum(self) -> None:
        """Optimize and clean up old versions across all tables."""
        await self.store.vacuum()

    def close(self):
        """Close the underlying store connection."""
        self.store.close()
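
For orientation, a minimal usage sketch of the client shown above. It only exercises names defined in this file (HaikuRAG, create_document_from_source, search, expand_context, ask); the database path and the sample document path are illustrative placeholders, not part of the package.

import asyncio
from pathlib import Path

from haiku.rag.client import HaikuRAG


async def main() -> None:
    # Open (or create) the LanceDB-backed store at an illustrative path.
    async with HaikuRAG(db_path=Path("./haiku.rag.lancedb")) as client:
        # Index a local file; re-parsing is skipped when the MD5 is unchanged.
        await client.create_document_from_source("./docs/example.md")

        # Hybrid search (vector + FTS), reranked only if a reranker is configured.
        results = await client.search("What does the client do?", limit=5)

        # Pull in adjacent chunks around each hit and merge overlapping ranges.
        expanded = await client.expand_context(results)
        for chunk, score in expanded:
            print(f"{score:.3f} {chunk.document_uri}")

        # Ask the configured QA agent, with citations.
        answer = await client.ask("What does the client do?", cite=True)
        print(answer)


asyncio.run(main())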