haiku.rag 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of haiku.rag might be problematic. Click here for more details.

haiku/rag/app.py CHANGED
@@ -40,7 +40,7 @@ class HaikuRAGApp:
40
40
  f"[b]Document with id [cyan]{doc.id}[/cyan] added successfully.[/b]"
41
41
  )
42
42
 
43
- async def get_document(self, doc_id: int):
43
+ async def get_document(self, doc_id: str):
44
44
  async with HaikuRAG(db_path=self.db_path) as self.client:
45
45
  doc = await self.client.get_document_by_id(doc_id)
46
46
  if doc is None:
@@ -48,14 +48,14 @@ class HaikuRAGApp:
48
48
  return
49
49
  self._rich_print_document(doc, truncate=False)
50
50
 
51
- async def delete_document(self, doc_id: int):
51
+ async def delete_document(self, doc_id: str):
52
52
  async with HaikuRAG(db_path=self.db_path) as self.client:
53
53
  await self.client.delete_document(doc_id)
54
54
  self.console.print(f"[b]Document {doc_id} deleted successfully.[/b]")
55
55
 
56
- async def search(self, query: str, limit: int = 5, k: int = 60):
56
+ async def search(self, query: str, limit: int = 5):
57
57
  async with HaikuRAG(db_path=self.db_path) as self.client:
58
- results = await self.client.search(query, limit=limit, k=k)
58
+ results = await self.client.search(query, limit=limit)
59
59
  if not results:
60
60
  self.console.print("[red]No results found.[/red]")
61
61
  return
haiku/rag/cli.py CHANGED
@@ -8,6 +8,7 @@ from rich.console import Console
8
8
 
9
9
  from haiku.rag.app import HaikuRAGApp
10
10
  from haiku.rag.config import Config
11
+ from haiku.rag.migration import migrate_sqlite_to_lancedb
11
12
  from haiku.rag.utils import is_up_to_date
12
13
 
13
14
  if not Config.ENV == "development":
@@ -47,7 +48,7 @@ def main(
47
48
  help="Show version and exit",
48
49
  ),
49
50
  ):
50
- """haiku.rag CLI - SQLite-based RAG system"""
51
+ """haiku.rag CLI - Vector database RAG system"""
51
52
  # Run version check before any command
52
53
  asyncio.run(check_version())
53
54
 
@@ -55,9 +56,9 @@ def main(
55
56
  @cli.command("list", help="List all stored documents")
56
57
  def list_documents(
57
58
  db: Path = typer.Option(
58
- Config.DEFAULT_DATA_DIR / "haiku.rag.sqlite",
59
+ Config.DEFAULT_DATA_DIR / "haiku.rag.lancedb",
59
60
  "--db",
60
- help="Path to the SQLite database file",
61
+ help="Path to the LanceDB database file",
61
62
  ),
62
63
  ):
63
64
  app = HaikuRAGApp(db_path=db)
@@ -70,9 +71,9 @@ def add_document_text(
70
71
  help="The text content of the document to add",
71
72
  ),
72
73
  db: Path = typer.Option(
73
- Config.DEFAULT_DATA_DIR / "haiku.rag.sqlite",
74
+ Config.DEFAULT_DATA_DIR / "haiku.rag.lancedb",
74
75
  "--db",
75
- help="Path to the SQLite database file",
76
+ help="Path to the LanceDB database file",
76
77
  ),
77
78
  ):
78
79
  app = HaikuRAGApp(db_path=db)
@@ -85,9 +86,9 @@ def add_document_src(
85
86
  help="The file path or URL of the document to add",
86
87
  ),
87
88
  db: Path = typer.Option(
88
- Config.DEFAULT_DATA_DIR / "haiku.rag.sqlite",
89
+ Config.DEFAULT_DATA_DIR / "haiku.rag.lancedb",
89
90
  "--db",
90
- help="Path to the SQLite database file",
91
+ help="Path to the LanceDB database file",
91
92
  ),
92
93
  ):
93
94
  app = HaikuRAGApp(db_path=db)
@@ -96,13 +97,13 @@ def add_document_src(
96
97
 
97
98
  @cli.command("get", help="Get and display a document by its ID")
98
99
  def get_document(
99
- doc_id: int = typer.Argument(
100
+ doc_id: str = typer.Argument(
100
101
  help="The ID of the document to get",
101
102
  ),
102
103
  db: Path = typer.Option(
103
- Config.DEFAULT_DATA_DIR / "haiku.rag.sqlite",
104
+ Config.DEFAULT_DATA_DIR / "haiku.rag.lancedb",
104
105
  "--db",
105
- help="Path to the SQLite database file",
106
+ help="Path to the LanceDB database file",
106
107
  ),
107
108
  ):
108
109
  app = HaikuRAGApp(db_path=db)
@@ -111,13 +112,13 @@ def get_document(
111
112
 
112
113
  @cli.command("delete", help="Delete a document by its ID")
113
114
  def delete_document(
114
- doc_id: int = typer.Argument(
115
+ doc_id: str = typer.Argument(
115
116
  help="The ID of the document to delete",
116
117
  ),
117
118
  db: Path = typer.Option(
118
- Config.DEFAULT_DATA_DIR / "haiku.rag.sqlite",
119
+ Config.DEFAULT_DATA_DIR / "haiku.rag.lancedb",
119
120
  "--db",
120
- help="Path to the SQLite database file",
121
+ help="Path to the LanceDB database file",
121
122
  ),
122
123
  ):
123
124
  app = HaikuRAGApp(db_path=db)
@@ -135,19 +136,14 @@ def search(
135
136
  "-l",
136
137
  help="Maximum number of results to return",
137
138
  ),
138
- k: int = typer.Option(
139
- 60,
140
- "--k",
141
- help="Reciprocal Rank Fusion k parameter",
142
- ),
143
139
  db: Path = typer.Option(
144
- Config.DEFAULT_DATA_DIR / "haiku.rag.sqlite",
140
+ Config.DEFAULT_DATA_DIR / "haiku.rag.lancedb",
145
141
  "--db",
146
- help="Path to the SQLite database file",
142
+ help="Path to the LanceDB database file",
147
143
  ),
148
144
  ):
149
145
  app = HaikuRAGApp(db_path=db)
150
- asyncio.run(app.search(query=query, limit=limit, k=k))
146
+ asyncio.run(app.search(query=query, limit=limit))
151
147
 
152
148
 
153
149
  @cli.command("ask", help="Ask a question using the QA agent")
@@ -156,9 +152,9 @@ def ask(
156
152
  help="The question to ask",
157
153
  ),
158
154
  db: Path = typer.Option(
159
- Config.DEFAULT_DATA_DIR / "haiku.rag.sqlite",
155
+ Config.DEFAULT_DATA_DIR / "haiku.rag.lancedb",
160
156
  "--db",
161
- help="Path to the SQLite database file",
157
+ help="Path to the LanceDB database file",
162
158
  ),
163
159
  cite: bool = typer.Option(
164
160
  False,
@@ -182,9 +178,9 @@ def settings():
182
178
  )
183
179
  def rebuild(
184
180
  db: Path = typer.Option(
185
- Config.DEFAULT_DATA_DIR / "haiku.rag.sqlite",
181
+ Config.DEFAULT_DATA_DIR / "haiku.rag.lancedb",
186
182
  "--db",
187
- help="Path to the SQLite database file",
183
+ help="Path to the LanceDB database file",
188
184
  ),
189
185
  ):
190
186
  app = HaikuRAGApp(db_path=db)
@@ -196,9 +192,9 @@ def rebuild(
196
192
  )
197
193
  def serve(
198
194
  db: Path = typer.Option(
199
- Config.DEFAULT_DATA_DIR / "haiku.rag.sqlite",
195
+ Config.DEFAULT_DATA_DIR / "haiku.rag.lancedb",
200
196
  "--db",
201
- help="Path to the SQLite database file",
197
+ help="Path to the LanceDB database file",
202
198
  ),
203
199
  stdio: bool = typer.Option(
204
200
  False,
@@ -227,5 +223,20 @@ def serve(
227
223
  asyncio.run(app.serve(transport=transport))
228
224
 
229
225
 
226
+ @cli.command("migrate", help="Migrate an SQLite database to LanceDB")
227
+ def migrate(
228
+ sqlite_path: Path = typer.Argument(
229
+ help="Path to the SQLite database file to migrate",
230
+ ),
231
+ ):
232
+ # Generate LanceDB path in same parent directory
233
+ lancedb_path = sqlite_path.parent / (sqlite_path.stem + ".lancedb")
234
+
235
+ success = asyncio.run(migrate_sqlite_to_lancedb(sqlite_path, lancedb_path))
236
+
237
+ if not success:
238
+ raise typer.Exit(1)
239
+
240
+
230
241
  if __name__ == "__main__":
231
242
  cli()
haiku/rag/client.py CHANGED
@@ -3,7 +3,6 @@ import mimetypes
3
3
  import tempfile
4
4
  from collections.abc import AsyncGenerator
5
5
  from pathlib import Path
6
- from typing import Literal
7
6
  from urllib.parse import urlparse
8
7
 
9
8
  import httpx
@@ -16,6 +15,7 @@ from haiku.rag.store.models.chunk import Chunk
16
15
  from haiku.rag.store.models.document import Document
17
16
  from haiku.rag.store.repositories.chunk import ChunkRepository
18
17
  from haiku.rag.store.repositories.document import DocumentRepository
18
+ from haiku.rag.store.repositories.settings import SettingsRepository
19
19
  from haiku.rag.utils import text_to_docling_document
20
20
 
21
21
 
@@ -24,19 +24,17 @@ class HaikuRAG:
24
24
 
25
25
  def __init__(
26
26
  self,
27
- db_path: Path | Literal[":memory:"] = Config.DEFAULT_DATA_DIR
28
- / "haiku.rag.sqlite",
27
+ db_path: Path = Config.DEFAULT_DATA_DIR / "haiku.rag.lancedb",
29
28
  skip_validation: bool = False,
30
29
  ):
31
30
  """Initialize the RAG client with a database path.
32
31
 
33
32
  Args:
34
- db_path: Path to the SQLite database file or ":memory:" for in-memory database.
33
+ db_path: Path to the database file.
35
34
  skip_validation: Whether to skip configuration validation on database load.
36
35
  """
37
- if isinstance(db_path, Path):
38
- if not db_path.parent.exists():
39
- Path.mkdir(db_path.parent, parents=True)
36
+ if not db_path.parent.exists():
37
+ Path.mkdir(db_path.parent, parents=True)
40
38
  self.store = Store(db_path, skip_validation=skip_validation)
41
39
  self.document_repository = DocumentRepository(self.store)
42
40
  self.chunk_repository = ChunkRepository(self.store)
@@ -269,7 +267,7 @@ class HaikuRAG:
269
267
  # Default to .html for web content
270
268
  return ".html"
271
269
 
272
- async def get_document_by_id(self, document_id: int) -> Document | None:
270
+ async def get_document_by_id(self, document_id: str) -> Document | None:
273
271
  """Get a document by its ID.
274
272
 
275
273
  Args:
@@ -300,7 +298,7 @@ class HaikuRAG:
300
298
  document, docling_document
301
299
  )
302
300
 
303
- async def delete_document(self, document_id: int) -> bool:
301
+ async def delete_document(self, document_id: str) -> bool:
304
302
  """Delete a document by its ID."""
305
303
  return await self.document_repository.delete(document_id)
306
304
 
@@ -319,14 +317,14 @@ class HaikuRAG:
319
317
  return await self.document_repository.list_all(limit=limit, offset=offset)
320
318
 
321
319
  async def search(
322
- self, query: str, limit: int = 5, k: int = 60
320
+ self, query: str, limit: int = 5, search_type: str = "hybrid"
323
321
  ) -> list[tuple[Chunk, float]]:
324
- """Search for relevant chunks using hybrid search (vector similarity + full-text search) with reranking.
322
+ """Search for relevant chunks using the specified search method with optional reranking.
325
323
 
326
324
  Args:
327
325
  query: The search query string.
328
326
  limit: Maximum number of results to return.
329
- k: Parameter for Reciprocal Rank Fusion (default: 60).
327
+ search_type: Type of search - "vector", "fts", or "hybrid" (default).
330
328
 
331
329
  Returns:
332
330
  List of (chunk, score) tuples ordered by relevance.
@@ -335,12 +333,15 @@ class HaikuRAG:
335
333
  reranker = get_reranker()
336
334
 
337
335
  if reranker is None:
338
- return await self.chunk_repository.search_chunks_hybrid(query, limit, k)
336
+ # No reranking - return direct search results
337
+ return await self.chunk_repository.search(query, limit, search_type)
339
338
 
340
339
  # Get more initial results (3X) for reranking
341
- search_results = await self.chunk_repository.search_chunks_hybrid(
342
- query, limit * 3, k
340
+ search_limit = limit * 3
341
+ search_results = await self.chunk_repository.search(
342
+ query, search_limit, search_type
343
343
  )
344
+
344
345
  # Apply reranking
345
346
  chunks = [chunk for chunk, _ in search_results]
346
347
  reranked_results = await reranker.rerank(query, chunks, top_n=limit)
@@ -493,7 +494,7 @@ class HaikuRAG:
493
494
  qa_agent = get_qa_agent(self, use_citations=cite)
494
495
  return await qa_agent.answer(question)
495
496
 
496
- async def rebuild_database(self) -> AsyncGenerator[int, None]:
497
+ async def rebuild_database(self) -> AsyncGenerator[str, None]:
497
498
  """Rebuild the database by deleting all chunks and re-indexing all documents.
498
499
 
499
500
  For documents with URIs:
@@ -510,10 +511,8 @@ class HaikuRAG:
510
511
  self.store.recreate_embeddings_table()
511
512
 
512
513
  # Update settings to current config
513
- from haiku.rag.store.repositories.settings import SettingsRepository
514
-
515
514
  settings_repo = SettingsRepository(self.store)
516
- settings_repo.save()
515
+ settings_repo.save_current_settings()
517
516
 
518
517
  documents = await self.list_documents()
519
518
 
@@ -547,13 +546,10 @@ class HaikuRAG:
547
546
  # Document without URI - re-create chunks from existing content
548
547
  docling_document = text_to_docling_document(doc.content)
549
548
  await self.chunk_repository.create_chunks_for_document(
550
- doc.id, docling_document, commit=False
549
+ doc.id, docling_document
551
550
  )
552
551
  yield doc.id
553
552
 
554
- if self.store._connection:
555
- self.store._connection.commit()
556
-
557
553
  def close(self):
558
554
  """Close the underlying store connection."""
559
555
  self.store.close()
haiku/rag/config.py CHANGED
@@ -12,6 +12,10 @@ load_dotenv()
12
12
  class AppConfig(BaseModel):
13
13
  ENV: str = "production"
14
14
 
15
+ LANCEDB_API_KEY: str = ""
16
+ LANCEDB_URI: str = ""
17
+ LANCEDB_REGION: str = ""
18
+
15
19
  DEFAULT_DATA_DIR: Path = get_default_data_dir()
16
20
  MONITOR_DIRECTORIES: list[Path] = []
17
21
 
@@ -19,8 +23,8 @@ class AppConfig(BaseModel):
19
23
  EMBEDDINGS_MODEL: str = "mxbai-embed-large"
20
24
  EMBEDDINGS_VECTOR_DIM: int = 1024
21
25
 
22
- RERANK_PROVIDER: str = "ollama"
23
- RERANK_MODEL: str = "qwen3"
26
+ RERANK_PROVIDER: str = ""
27
+ RERANK_MODEL: str = ""
24
28
 
25
29
  QA_PROVIDER: str = "ollama"
26
30
  QA_MODEL: str = "qwen3"
haiku/rag/logging.py CHANGED
@@ -3,6 +3,11 @@ import logging
3
3
  from rich.console import Console
4
4
  from rich.logging import RichHandler
5
5
 
6
+ logging.basicConfig(level=logging.DEBUG)
7
+ logging.getLogger("httpx").setLevel(logging.WARNING)
8
+ logging.getLogger("httpcore").setLevel(logging.WARNING)
9
+ logging.getLogger("docling").setLevel(logging.WARNING)
10
+
6
11
 
7
12
  def get_logger() -> logging.Logger:
8
13
  logger = logging.getLogger("haiku.rag")
haiku/rag/mcp.py CHANGED
@@ -1,5 +1,5 @@
1
1
  from pathlib import Path
2
- from typing import Any, Literal
2
+ from typing import Any
3
3
 
4
4
  from fastmcp import FastMCP
5
5
  from pydantic import BaseModel
@@ -8,13 +8,13 @@ from haiku.rag.client import HaikuRAG
8
8
 
9
9
 
10
10
  class SearchResult(BaseModel):
11
- document_id: int
11
+ document_id: str
12
12
  content: str
13
13
  score: float
14
14
 
15
15
 
16
16
  class DocumentResult(BaseModel):
17
- id: int | None
17
+ id: str | None
18
18
  content: str
19
19
  uri: str | None = None
20
20
  metadata: dict[str, Any] = {}
@@ -22,14 +22,14 @@ class DocumentResult(BaseModel):
22
22
  updated_at: str
23
23
 
24
24
 
25
- def create_mcp_server(db_path: Path | Literal[":memory:"]) -> FastMCP:
25
+ def create_mcp_server(db_path: Path) -> FastMCP:
26
26
  """Create an MCP server with the specified database path."""
27
27
  mcp = FastMCP("haiku-rag")
28
28
 
29
29
  @mcp.tool()
30
30
  async def add_document_from_file(
31
31
  file_path: str, metadata: dict[str, Any] | None = None
32
- ) -> int | None:
32
+ ) -> str | None:
33
33
  """Add a document to the RAG system from a file path."""
34
34
  try:
35
35
  async with HaikuRAG(db_path) as rag:
@@ -43,7 +43,7 @@ def create_mcp_server(db_path: Path | Literal[":memory:"]) -> FastMCP:
43
43
  @mcp.tool()
44
44
  async def add_document_from_url(
45
45
  url: str, metadata: dict[str, Any] | None = None
46
- ) -> int | None:
46
+ ) -> str | None:
47
47
  """Add a document to the RAG system from a URL."""
48
48
  try:
49
49
  async with HaikuRAG(db_path) as rag:
@@ -55,7 +55,7 @@ def create_mcp_server(db_path: Path | Literal[":memory:"]) -> FastMCP:
55
55
  @mcp.tool()
56
56
  async def add_document_from_text(
57
57
  content: str, uri: str | None = None, metadata: dict[str, Any] | None = None
58
- ) -> int | None:
58
+ ) -> str | None:
59
59
  """Add a document to the RAG system from text content."""
60
60
  try:
61
61
  async with HaikuRAG(db_path) as rag:
@@ -73,6 +73,9 @@ def create_mcp_server(db_path: Path | Literal[":memory:"]) -> FastMCP:
73
73
 
74
74
  search_results = []
75
75
  for chunk, score in results:
76
+ assert chunk.document_id is not None, (
77
+ "Chunk document_id should not be None in search results"
78
+ )
76
79
  search_results.append(
77
80
  SearchResult(
78
81
  document_id=chunk.document_id,
@@ -86,7 +89,7 @@ def create_mcp_server(db_path: Path | Literal[":memory:"]) -> FastMCP:
86
89
  return []
87
90
 
88
91
  @mcp.tool()
89
- async def get_document(document_id: int) -> DocumentResult | None:
92
+ async def get_document(document_id: str) -> DocumentResult | None:
90
93
  """Get a document by its ID."""
91
94
  try:
92
95
  async with HaikuRAG(db_path) as rag:
@@ -130,7 +133,7 @@ def create_mcp_server(db_path: Path | Literal[":memory:"]) -> FastMCP:
130
133
  return []
131
134
 
132
135
  @mcp.tool()
133
- async def delete_document(document_id: int) -> bool:
136
+ async def delete_document(document_id: str) -> bool:
134
137
  """Delete a document by its ID."""
135
138
  try:
136
139
  async with HaikuRAG(db_path) as rag: