haiku.rag 0.10.2__py3-none-any.whl → 0.19.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- README.md +172 -0
- {haiku_rag-0.10.2.dist-info → haiku_rag-0.19.3.dist-info}/METADATA +79 -51
- haiku_rag-0.19.3.dist-info/RECORD +6 -0
- {haiku_rag-0.10.2.dist-info → haiku_rag-0.19.3.dist-info}/WHEEL +1 -1
- haiku/rag/__init__.py +0 -0
- haiku/rag/app.py +0 -437
- haiku/rag/chunker.py +0 -51
- haiku/rag/cli.py +0 -466
- haiku/rag/client.py +0 -605
- haiku/rag/config.py +0 -81
- haiku/rag/embeddings/__init__.py +0 -35
- haiku/rag/embeddings/base.py +0 -15
- haiku/rag/embeddings/ollama.py +0 -17
- haiku/rag/embeddings/openai.py +0 -16
- haiku/rag/embeddings/vllm.py +0 -19
- haiku/rag/embeddings/voyageai.py +0 -17
- haiku/rag/logging.py +0 -56
- haiku/rag/mcp.py +0 -156
- haiku/rag/migration.py +0 -316
- haiku/rag/monitor.py +0 -73
- haiku/rag/qa/__init__.py +0 -15
- haiku/rag/qa/agent.py +0 -91
- haiku/rag/qa/prompts.py +0 -60
- haiku/rag/reader.py +0 -115
- haiku/rag/reranking/__init__.py +0 -34
- haiku/rag/reranking/base.py +0 -13
- haiku/rag/reranking/cohere.py +0 -34
- haiku/rag/reranking/mxbai.py +0 -28
- haiku/rag/reranking/vllm.py +0 -44
- haiku/rag/research/__init__.py +0 -20
- haiku/rag/research/common.py +0 -53
- haiku/rag/research/dependencies.py +0 -47
- haiku/rag/research/graph.py +0 -29
- haiku/rag/research/models.py +0 -70
- haiku/rag/research/nodes/evaluate.py +0 -80
- haiku/rag/research/nodes/plan.py +0 -63
- haiku/rag/research/nodes/search.py +0 -93
- haiku/rag/research/nodes/synthesize.py +0 -51
- haiku/rag/research/prompts.py +0 -114
- haiku/rag/research/state.py +0 -25
- haiku/rag/store/__init__.py +0 -4
- haiku/rag/store/engine.py +0 -269
- haiku/rag/store/models/__init__.py +0 -4
- haiku/rag/store/models/chunk.py +0 -17
- haiku/rag/store/models/document.py +0 -17
- haiku/rag/store/repositories/__init__.py +0 -9
- haiku/rag/store/repositories/chunk.py +0 -424
- haiku/rag/store/repositories/document.py +0 -237
- haiku/rag/store/repositories/settings.py +0 -155
- haiku/rag/store/upgrades/__init__.py +0 -62
- haiku/rag/store/upgrades/v0_10_1.py +0 -64
- haiku/rag/store/upgrades/v0_9_3.py +0 -112
- haiku/rag/utils.py +0 -199
- haiku_rag-0.10.2.dist-info/RECORD +0 -54
- {haiku_rag-0.10.2.dist-info → haiku_rag-0.19.3.dist-info}/entry_points.txt +0 -0
- {haiku_rag-0.10.2.dist-info → haiku_rag-0.19.3.dist-info}/licenses/LICENSE +0 -0
haiku/rag/embeddings/__init__.py
DELETED
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
from haiku.rag.config import Config
|
|
2
|
-
from haiku.rag.embeddings.base import EmbedderBase
|
|
3
|
-
from haiku.rag.embeddings.ollama import Embedder as OllamaEmbedder
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def get_embedder() -> EmbedderBase:
    """Return the embedder selected by ``Config.EMBEDDINGS_PROVIDER``.

    Supported providers are ``"ollama"``, ``"voyageai"``, ``"openai"`` and
    ``"vllm"``. Every embedder is constructed with
    ``Config.EMBEDDINGS_MODEL`` and ``Config.EMBEDDINGS_VECTOR_DIM``.
    Provider modules other than Ollama are imported lazily, only when that
    provider is selected.

    Returns:
        An ``EmbedderBase`` subclass instance for the configured provider.

    Raises:
        ImportError: If the ``voyageai`` provider is selected but its
            embedder cannot be imported.
        ValueError: If the configured provider is not one of the supported
            names.
    """

    if Config.EMBEDDINGS_PROVIDER == "ollama":
        return OllamaEmbedder(Config.EMBEDDINGS_MODEL, Config.EMBEDDINGS_VECTOR_DIM)

    if Config.EMBEDDINGS_PROVIDER == "voyageai":
        try:
            from haiku.rag.embeddings.voyageai import Embedder as VoyageAIEmbedder
        except ImportError as err:
            # Chain explicitly so the traceback shows the original import
            # failure as the direct cause of this friendlier message.
            raise ImportError(
                "VoyageAI embedder requires the 'voyageai' package. "
                "Please install haiku.rag with the 'voyageai' extra: "
                "uv pip install haiku.rag[voyageai]"
            ) from err
        return VoyageAIEmbedder(Config.EMBEDDINGS_MODEL, Config.EMBEDDINGS_VECTOR_DIM)

    if Config.EMBEDDINGS_PROVIDER == "openai":
        from haiku.rag.embeddings.openai import Embedder as OpenAIEmbedder

        return OpenAIEmbedder(Config.EMBEDDINGS_MODEL, Config.EMBEDDINGS_VECTOR_DIM)

    if Config.EMBEDDINGS_PROVIDER == "vllm":
        from haiku.rag.embeddings.vllm import Embedder as VllmEmbedder

        return VllmEmbedder(Config.EMBEDDINGS_MODEL, Config.EMBEDDINGS_VECTOR_DIM)

    raise ValueError(f"Unsupported embedding provider: {Config.EMBEDDINGS_PROVIDER}")
|
haiku/rag/embeddings/base.py
DELETED
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
from haiku.rag.config import Config
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class EmbedderBase:
    """Common base for embedding providers.

    Stores the model name and vector dimension; subclasses supply the
    actual ``embed`` implementation.
    """

    # Class-level defaults taken from the configuration; ``__init__``
    # overrides both on every instance.
    _model: str = Config.EMBEDDINGS_MODEL
    _vector_dim: int = Config.EMBEDDINGS_VECTOR_DIM

    def __init__(self, model: str, vector_dim: int):
        """Remember the model name and embedding dimension for this embedder."""
        self._model, self._vector_dim = model, vector_dim

    async def embed(self, text: str | list[str]) -> list[float] | list[list[float]]:
        """Embed one string or a list of strings; must be overridden.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        message = "Embedder is an abstract class. Please implement the embed method in a subclass."
        raise NotImplementedError(message)
|
haiku/rag/embeddings/ollama.py
DELETED
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
from openai import AsyncOpenAI
|
|
2
|
-
|
|
3
|
-
from haiku.rag.config import Config
|
|
4
|
-
from haiku.rag.embeddings.base import EmbedderBase
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class Embedder(EmbedderBase):
    """Embedder that calls the embeddings endpoint under ``Config.OLLAMA_BASE_URL``."""

    async def embed(self, text: str | list[str]) -> list[float] | list[list[float]]:
        """Embed ``text`` with the configured model.

        Returns a single vector for a ``str`` input and a list of vectors
        for a list input.
        """
        endpoint = f"{Config.OLLAMA_BASE_URL}/v1"
        # The original passes a placeholder API key; the value is arbitrary.
        client = AsyncOpenAI(base_url=endpoint, api_key="dummy")
        response = await client.embeddings.create(model=self._model, input=text)

        if not isinstance(text, str):
            return [entry.embedding for entry in response.data]
        return response.data[0].embedding
|
haiku/rag/embeddings/openai.py
DELETED
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
from openai import AsyncOpenAI
|
|
2
|
-
|
|
3
|
-
from haiku.rag.embeddings.base import EmbedderBase
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class Embedder(EmbedderBase):
    """Embedder that uses a default-configured ``AsyncOpenAI`` client."""

    async def embed(self, text: str | list[str]) -> list[float] | list[list[float]]:
        """Embed ``text`` with the configured model.

        Returns a single vector for a ``str`` input and a list of vectors
        for a list input.
        """
        # No arguments: the client is constructed with its own defaults.
        client = AsyncOpenAI()
        response = await client.embeddings.create(model=self._model, input=text)

        if not isinstance(text, str):
            return [entry.embedding for entry in response.data]
        return response.data[0].embedding
|
haiku/rag/embeddings/vllm.py
DELETED
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
from openai import AsyncOpenAI
|
|
2
|
-
|
|
3
|
-
from haiku.rag.config import Config
|
|
4
|
-
from haiku.rag.embeddings.base import EmbedderBase
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class Embedder(EmbedderBase):
    """Embedder that calls the embeddings endpoint under ``Config.VLLM_EMBEDDINGS_BASE_URL``."""

    async def embed(self, text: str | list[str]) -> list[float] | list[list[float]]:
        """Embed ``text`` with the configured model.

        Returns a single vector for a ``str`` input and a list of vectors
        for a list input.
        """
        endpoint = f"{Config.VLLM_EMBEDDINGS_BASE_URL}/v1"
        # The original passes a placeholder API key; the value is arbitrary.
        client = AsyncOpenAI(base_url=endpoint, api_key="dummy")
        response = await client.embeddings.create(model=self._model, input=text)

        if not isinstance(text, str):
            return [entry.embedding for entry in response.data]
        return response.data[0].embedding
|
haiku/rag/embeddings/voyageai.py
DELETED
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
# The whole module body is guarded: when the optional ``voyageai`` package is
# missing, the import fails silently and no ``Embedder`` name is defined here.
try:
    from voyageai.client import Client  # type: ignore

    from haiku.rag.embeddings.base import EmbedderBase

    class Embedder(EmbedderBase):
        """Embedder backed by the VoyageAI client."""

        async def embed(self, text: str | list[str]) -> list[float] | list[list[float]]:
            """Embed ``text`` with the configured model.

            A single string is wrapped in a one-element batch and its only
            vector is returned; a list yields one vector per element.
            """
            client = Client()
            is_single = isinstance(text, str)
            batch = [text] if is_single else text
            res = client.embed(batch, model=self._model, output_dtype="float")
            if is_single:
                return res.embeddings[0]  # type: ignore[return-value]
            return res.embeddings  # type: ignore[return-value]

except ImportError:
    pass
|
haiku/rag/logging.py
DELETED
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
import warnings
|
|
3
|
-
|
|
4
|
-
from rich.console import Console
|
|
5
|
-
from rich.logging import RichHandler
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
def get_logger() -> logging.Logger:
    """Return the ``haiku.rag`` logger, freshly wired to a single Rich handler.

    The logger is set to INFO, any handlers already attached are removed so
    repeated calls do not duplicate output, and propagation to the root
    logger is disabled.
    """
    rich_handler = RichHandler(console=Console(stderr=True), rich_tracebacks=True)
    rich_handler.setFormatter(logging.Formatter("%(message)s"))

    log = logging.getLogger("haiku.rag")
    log.setLevel(logging.INFO)

    # Iterate over a copy: removeHandler mutates ``log.handlers``.
    for stale in list(log.handlers):
        log.removeHandler(stale)

    log.addHandler(rich_handler)
    # Keep messages away from the root logger.
    log.propagate = False
    return log
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
def configure_cli_logging(level: int = logging.INFO) -> logging.Logger:
    """Set up logging for a CLI run and return the ``haiku.rag`` logger.

    - The root logger loses all of its handlers and is raised to ERROR.
    - A fixed set of noisy third-party loggers is raised to ERROR and
      detached from propagation.
    - The ``haiku.rag`` logger gets its Rich handler via ``get_logger`` and
      is set to ``level`` with propagation disabled.
    - All Python warnings are ignored.
    """
    # Strip the root logger of every handler and quiet it down.
    root_logger = logging.getLogger()
    for attached in list(root_logger.handlers):
        root_logger.removeHandler(attached)
    root_logger.setLevel(logging.ERROR)

    # Extra safeguard for libraries known to be chatty.
    for name in ("httpx", "httpcore", "docling", "urllib3", "asyncio"):
        noisy_logger = logging.getLogger(name)
        noisy_logger.setLevel(logging.ERROR)
        noisy_logger.propagate = False

    # Finally configure the application logger itself.
    app_logger = get_logger()
    app_logger.setLevel(level)
    app_logger.propagate = False

    warnings.filterwarnings("ignore")
    return app_logger
|
haiku/rag/mcp.py
DELETED
|
@@ -1,156 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
from typing import Any
|
|
3
|
-
|
|
4
|
-
from fastmcp import FastMCP
|
|
5
|
-
from pydantic import BaseModel
|
|
6
|
-
|
|
7
|
-
from haiku.rag.client import HaikuRAG
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
# One search hit returned by the ``search_documents`` MCP tool.
class SearchResult(BaseModel):
    # ID of the document the matching chunk belongs to.
    document_id: str
    # Text content of the matching chunk.
    content: str
    # Score paired with the chunk by the search call.
    score: float
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
# Document payload returned by the ``get_document`` and ``list_documents`` MCP tools.
class DocumentResult(BaseModel):
    # Document identifier; may be None.
    id: str | None
    # Document content.
    content: str
    # Optional source URI.
    uri: str | None = None
    # Optional title.
    title: str | None = None
    # Arbitrary metadata; empty by default.
    metadata: dict[str, Any] = {}
    # Timestamps, filled from ``str(document.created_at)`` / ``str(document.updated_at)``.
    created_at: str
    updated_at: str
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
def create_mcp_server(db_path: Path) -> FastMCP:
    """Build the ``haiku-rag`` FastMCP server bound to ``db_path``.

    Each registered tool opens its own ``HaikuRAG(db_path)`` context for the
    duration of the call. Any exception raised inside a tool is swallowed and
    replaced by a neutral fallback (``None``, ``[]`` or ``False``).
    """
    server = FastMCP("haiku-rag")

    def _as_document_result(doc) -> DocumentResult:
        # Shared conversion used by get_document and list_documents.
        return DocumentResult(
            id=doc.id,
            content=doc.content,
            uri=doc.uri,
            title=doc.title,
            metadata=doc.metadata,
            created_at=str(doc.created_at),
            updated_at=str(doc.updated_at),
        )

    @server.tool()
    async def add_document_from_file(
        file_path: str,
        metadata: dict[str, Any] | None = None,
        title: str | None = None,
    ) -> str | None:
        """Add a document to the RAG system from a file path."""
        try:
            async with HaikuRAG(db_path) as rag:
                created = await rag.create_document_from_source(
                    Path(file_path), title=title, metadata=metadata or {}
                )
                return created.id
        except Exception:
            return None

    @server.tool()
    async def add_document_from_url(
        url: str, metadata: dict[str, Any] | None = None, title: str | None = None
    ) -> str | None:
        """Add a document to the RAG system from a URL."""
        try:
            async with HaikuRAG(db_path) as rag:
                created = await rag.create_document_from_source(
                    url, title=title, metadata=metadata or {}
                )
                return created.id
        except Exception:
            return None

    @server.tool()
    async def add_document_from_text(
        content: str,
        uri: str | None = None,
        metadata: dict[str, Any] | None = None,
        title: str | None = None,
    ) -> str | None:
        """Add a document to the RAG system from text content."""
        try:
            async with HaikuRAG(db_path) as rag:
                created = await rag.create_document(
                    content, uri, title=title, metadata=metadata or {}
                )
                return created.id
        except Exception:
            return None

    @server.tool()
    async def search_documents(query: str, limit: int = 5) -> list[SearchResult]:
        """Search the RAG system for documents using hybrid search (vector similarity + full-text search)."""
        try:
            async with HaikuRAG(db_path) as rag:
                hits = await rag.search(query, limit)

                found = []
                for chunk, score in hits:
                    assert chunk.document_id is not None, (
                        "Chunk document_id should not be None in search results"
                    )
                    hit = SearchResult(
                        document_id=chunk.document_id,
                        content=chunk.content,
                        score=score,
                    )
                    found.append(hit)

                return found
        except Exception:
            return []

    @server.tool()
    async def get_document(document_id: str) -> DocumentResult | None:
        """Get a document by its ID."""
        try:
            async with HaikuRAG(db_path) as rag:
                document = await rag.get_document_by_id(document_id)
                return None if document is None else _as_document_result(document)
        except Exception:
            return None

    @server.tool()
    async def list_documents(
        limit: int | None = None, offset: int | None = None
    ) -> list[DocumentResult]:
        """List all documents with optional pagination."""
        try:
            async with HaikuRAG(db_path) as rag:
                documents = await rag.list_documents(limit, offset)
                return [_as_document_result(doc) for doc in documents]
        except Exception:
            return []

    @server.tool()
    async def delete_document(document_id: str) -> bool:
        """Delete a document by its ID."""
        try:
            async with HaikuRAG(db_path) as rag:
                return await rag.delete_document(document_id)
        except Exception:
            return False

    return server
|
haiku/rag/migration.py
DELETED
|
@@ -1,316 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import sqlite3
|
|
3
|
-
import struct
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
from uuid import uuid4
|
|
6
|
-
|
|
7
|
-
from rich.console import Console
|
|
8
|
-
from rich.progress import Progress, TaskID
|
|
9
|
-
|
|
10
|
-
from haiku.rag.store.engine import Store
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def deserialize_sqlite_embedding(data: bytes) -> list[float]:
    """Decode a packed float32 blob (as stored by sqlite-vec) into a list of floats.

    Empty or missing data yields an empty list. The blob is read as
    consecutive 4-byte floats in native byte order; ``struct.error`` is
    raised when its length is not a multiple of four.
    """
    if data:
        # Four bytes per float32 value.
        count = len(data) // 4
        return list(struct.unpack("%df" % count, data))
    return []
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class SQLiteToLanceDBMigrator:
    """Migrates documents, chunks, embeddings and settings from SQLite to LanceDB.

    Documents and chunks receive fresh UUID string IDs; the old integer
    document IDs are mapped to the new UUIDs so chunks keep pointing at the
    right document.
    """

    def __init__(self, sqlite_path: Path, lancedb_path: Path):
        """Store source/target paths and create the console used for progress output."""
        self.sqlite_path = sqlite_path
        self.lancedb_path = lancedb_path
        self.console = Console()

    def migrate(self) -> bool:
        """Perform the migration.

        Returns:
            True on success, False if the SQLite file is missing or any
            step raises an ``Exception`` (the traceback is printed).

        Raises:
            SystemExit: With code 1 when the sqlite-vec extension cannot be
                loaded.
        """
        try:
            self.console.print(
                f"[blue]Starting migration from {self.sqlite_path} to {self.lancedb_path}[/blue]"
            )

            # Check if SQLite database exists
            if not self.sqlite_path.exists():
                self.console.print(
                    f"[red]SQLite database not found: {self.sqlite_path}[/red]"
                )
                return False

            sqlite_conn = sqlite3.connect(self.sqlite_path)
            lance_store = None
            try:
                sqlite_conn.row_factory = sqlite3.Row
                self._load_sqlite_vec(sqlite_conn)

                # Create LanceDB store
                lance_store = Store(self.lancedb_path, skip_validation=True)

                with Progress() as progress:
                    doc_task = progress.add_task(
                        "[green]Migrating documents...", total=None
                    )
                    document_id_mapping = self._migrate_documents(
                        sqlite_conn, lance_store, progress, doc_task
                    )

                    chunk_task = progress.add_task(
                        "[yellow]Migrating chunks and embeddings...", total=None
                    )
                    self._migrate_chunks(
                        sqlite_conn, lance_store, progress, chunk_task, document_id_mapping
                    )

                    settings_task = progress.add_task(
                        "[blue]Migrating settings...", total=None
                    )
                    self._migrate_settings(
                        sqlite_conn, lance_store, progress, settings_task
                    )

                # Optimize and cleanup using centralized vacuum; a failure
                # here is only a warning and does not fail the migration.
                self.console.print("[cyan]Optimizing LanceDB...[/cyan]")
                try:
                    lance_store.vacuum()
                    self.console.print("[green]✅ Optimization completed[/green]")
                except Exception as e:
                    self.console.print(
                        f"[yellow]Warning: Optimization failed: {e}[/yellow]"
                    )
            finally:
                # Release both handles on every path (success, failure, or
                # SystemExit) so neither the store nor the SQLite file leaks.
                try:
                    if lance_store is not None:
                        lance_store.close()
                finally:
                    sqlite_conn.close()

            self.console.print("[green]✅ Migration completed successfully![/green]")
            self.console.print(
                f"[green]✅ Migrated {len(document_id_mapping)} documents[/green]"
            )
            return True

        except Exception as e:
            self.console.print(f"[red]❌ Migration failed: {e}[/red]")
            import traceback

            self.console.print(f"[red]{traceback.format_exc()}[/red]")
            return False

    def _load_sqlite_vec(self, sqlite_conn: sqlite3.Connection) -> None:
        """Load the sqlite-vec extension into ``sqlite_conn`` or exit with code 1."""
        try:
            import sqlite_vec  # type: ignore

            sqlite_conn.enable_load_extension(True)
            sqlite_vec.load(sqlite_conn)
            self.console.print("[cyan]Loaded sqlite-vec extension[/cyan]")
        except Exception as e:
            self.console.print(
                f"[yellow]Warning: Could not load sqlite-vec extension: {e}[/yellow]"
            )
            self.console.print(
                "[yellow]Install sqlite-vec with[/yellow]\n[green]uv pip install sqlite-vec [/green]"
            )
            # Raise SystemExit directly: the ``exit`` builtin is injected by
            # the ``site`` module and is not guaranteed to exist.
            raise SystemExit(1) from e

    def _migrate_documents(
        self,
        sqlite_conn: sqlite3.Connection,
        lance_store: Store,
        progress: Progress,
        task: TaskID,
    ) -> dict[int, str]:
        """Migrate documents from SQLite to LanceDB and return ID mapping.

        Returns:
            Mapping of old integer document IDs to the new UUID strings.
        """
        cursor = sqlite_conn.cursor()
        cursor.execute(
            "SELECT id, content, uri, metadata, created_at, updated_at FROM documents ORDER BY id"
        )
        rows = cursor.fetchall()

        # Maps old integer ID to new UUID
        id_mapping: dict[int, str] = {row["id"]: str(uuid4()) for row in rows}

        # Batch insert documents to LanceDB
        if rows:
            from haiku.rag.store.engine import DocumentRecord

            doc_records = [
                DocumentRecord(
                    id=id_mapping[row["id"]],
                    content=row["content"],
                    uri=row["uri"],
                    # Round-trip through json so empty/NULL metadata becomes "{}".
                    metadata=json.dumps(
                        json.loads(row["metadata"]) if row["metadata"] else {}
                    ),
                    created_at=row["created_at"],
                    updated_at=row["updated_at"],
                )
                for row in rows
            ]
            lance_store.documents_table.add(doc_records)

        progress.update(task, completed=len(rows), total=len(rows))
        return id_mapping

    def _migrate_chunks(
        self,
        sqlite_conn: sqlite3.Connection,
        lance_store: Store,
        progress: Progress,
        task: TaskID,
        document_id_mapping: dict[int, str],
    ):
        """Migrate chunks and embeddings from SQLite to LanceDB.

        Chunks whose document ID is not in ``document_id_mapping`` are
        skipped with a warning. Chunks without a usable embedding receive a
        zero vector of the store embedder's dimension.
        """
        cursor = sqlite_conn.cursor()

        # Get chunks first
        cursor.execute("""
            SELECT id, document_id, content, metadata
            FROM chunks
            ORDER BY id
        """)
        chunks_data = cursor.fetchall()

        # Get embeddings using the sqlite-vec virtual table; the first
        # non-empty blob seen for a chunk wins.
        embeddings_map = {}
        try:
            cursor.execute("""
                SELECT chunk_id, embedding
                FROM chunk_embeddings
            """)
            for chunk_id, embedding_blob in cursor.fetchall():
                if embedding_blob and chunk_id not in embeddings_map:
                    embeddings_map[chunk_id] = embedding_blob
        except sqlite3.OperationalError as e:
            self.console.print(
                f"[yellow]Warning: Could not extract embeddings from virtual table: {e}[/yellow]"
            )

        chunk_records = []
        for row in chunks_data:
            # Map the old document_id to new UUID
            document_uuid = document_id_mapping.get(row["document_id"])
            if not document_uuid:
                self.console.print(
                    f"[yellow]Warning: Document ID {row['document_id']} not found in mapping for chunk {row['id']}[/yellow]"
                )
                continue

            embedding = None
            embedding_blob = embeddings_map.get(row["id"])
            if embedding_blob:
                try:
                    embedding = deserialize_sqlite_embedding(embedding_blob)
                except Exception as e:
                    self.console.print(
                        f"[yellow]Warning: Failed to deserialize embedding for chunk {row['id']}: {e}[/yellow]"
                    )
            if embedding is None:
                # Missing or unreadable embedding: zero vector of the expected dimension.
                embedding = [0.0] * lance_store.embedder._vector_dim

            chunk_records.append(
                lance_store.ChunkRecord(
                    id=str(uuid4()),
                    document_id=document_uuid,
                    content=row["content"],
                    metadata=json.dumps(
                        json.loads(row["metadata"]) if row["metadata"] else {}
                    ),
                    vector=embedding,
                )
            )

        # Batch insert chunks to LanceDB
        if chunk_records:
            lance_store.chunks_table.add(chunk_records)

        progress.update(task, completed=len(chunk_records), total=len(chunk_records))

    def _migrate_settings(
        self,
        sqlite_conn: sqlite3.Connection,
        lance_store: Store,
        progress: Progress,
        task: TaskID,
    ):
        """Migrate settings from SQLite to LanceDB.

        Reads the row with ``id = 1`` from the SQLite ``settings`` table and
        writes it over the LanceDB row whose id is ``'settings'``. A missing
        table or row is reported as zero work done.
        """
        cursor = sqlite_conn.cursor()

        try:
            cursor.execute("SELECT id, settings FROM settings WHERE id = 1")
            row = cursor.fetchone()

            if row:
                settings_data = json.loads(row["settings"]) if row["settings"] else {}

                # Update the existing settings in LanceDB (use string ID)
                lance_store.settings_table.update(
                    where="id = 'settings'",
                    values={"settings": json.dumps(settings_data)},
                )

                progress.update(task, completed=1, total=1)
            else:
                progress.update(task, completed=0, total=0)

        except sqlite3.OperationalError:
            # Settings table doesn't exist in old SQLite database
            self.console.print(
                "[yellow]No settings table found in SQLite database[/yellow]"
            )
            progress.update(task, completed=0, total=0)
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
async def migrate_sqlite_to_lancedb(
    sqlite_path: Path, lancedb_path: Path | None = None
) -> bool:
    """
    Run a SQLite-to-LanceDB migration and report whether it succeeded.

    Args:
        sqlite_path: Location of the existing SQLite database.
        lancedb_path: Target LanceDB location. When omitted, it is placed
            next to the SQLite file, named ``<stem>.lancedb``.

    Returns:
        The boolean result of ``SQLiteToLanceDBMigrator.migrate``.
    """
    target = lancedb_path
    if target is None:
        # Derive a sibling path from the SQLite file name.
        target_name = sqlite_path.stem + ".lancedb"
        target = sqlite_path.parent / target_name

    return SQLiteToLanceDBMigrator(sqlite_path, target).migrate()
|