haiku.rag 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of haiku.rag might be problematic. Click here for more details.
- haiku/rag/app.py +4 -4
- haiku/rag/cli.py +38 -27
- haiku/rag/client.py +19 -23
- haiku/rag/config.py +6 -2
- haiku/rag/logging.py +4 -0
- haiku/rag/mcp.py +12 -9
- haiku/rag/migration.py +316 -0
- haiku/rag/reranking/__init__.py +0 -6
- haiku/rag/store/engine.py +173 -141
- haiku/rag/store/models/chunk.py +2 -2
- haiku/rag/store/models/document.py +1 -1
- haiku/rag/store/repositories/__init__.py +6 -2
- haiku/rag/store/repositories/chunk.py +279 -414
- haiku/rag/store/repositories/document.py +171 -205
- haiku/rag/store/repositories/settings.py +115 -49
- haiku/rag/store/upgrades/__init__.py +1 -3
- haiku/rag/utils.py +39 -31
- {haiku_rag-0.6.0.dist-info → haiku_rag-0.7.0.dist-info}/METADATA +21 -16
- haiku_rag-0.7.0.dist-info/RECORD +39 -0
- haiku/rag/reranking/ollama.py +0 -81
- haiku/rag/store/repositories/base.py +0 -40
- haiku/rag/store/upgrades/v0_3_4.py +0 -26
- haiku_rag-0.6.0.dist-info/RECORD +0 -41
- {haiku_rag-0.6.0.dist-info → haiku_rag-0.7.0.dist-info}/WHEEL +0 -0
- {haiku_rag-0.6.0.dist-info → haiku_rag-0.7.0.dist-info}/entry_points.txt +0 -0
- {haiku_rag-0.6.0.dist-info → haiku_rag-0.7.0.dist-info}/licenses/LICENSE +0 -0
haiku/rag/migration.py
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Migration script to migrate from SQLite to LanceDB.
|
|
4
|
+
|
|
5
|
+
This script will:
|
|
6
|
+
1. Read data from an existing SQLite database
|
|
7
|
+
2. Create a new LanceDB database with the same data
|
|
8
|
+
3. Preserve all documents, chunks, embeddings, and settings
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import sqlite3
|
|
13
|
+
import struct
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from uuid import uuid4
|
|
16
|
+
|
|
17
|
+
from rich.console import Console
|
|
18
|
+
from rich.progress import Progress, TaskID
|
|
19
|
+
|
|
20
|
+
from haiku.rag.store.engine import Store
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def deserialize_sqlite_embedding(data: bytes) -> list[float]:
    """Decode a sqlite-vec embedding blob into a list of Python floats.

    The blob is interpreted as a packed array of 4-byte floats in native
    byte order. An empty (or missing) blob yields an empty list. A blob
    whose length is not a multiple of four makes ``struct.unpack`` raise
    ``struct.error``.
    """
    if data:
        # Each float32 occupies four bytes, so the element count is len // 4.
        float_count = len(data) // 4
        values = struct.unpack(f"{float_count}f", data)
        return list(values)
    return []
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class SQLiteToLanceDBMigrator:
    """Copy documents, chunks, embeddings and settings from SQLite to LanceDB.

    Integer primary keys from the SQLite tables are replaced with freshly
    generated UUID strings. Chunks are re-linked to their documents through
    the old-id -> new-UUID mapping built while the documents are migrated.
    """

    def __init__(self, sqlite_path: Path, lancedb_path: Path):
        """Store the source/target paths and create a console for output."""
        self.sqlite_path = sqlite_path
        self.lancedb_path = lancedb_path
        self.console = Console()

    def migrate(self) -> bool:
        """Run the whole migration.

        Returns:
            True when every step completed. False when the SQLite file does
            not exist or any step raised; in the latter case the error and
            its traceback are printed to the console.
        """
        try:
            self.console.print(
                f"[blue]Starting migration from {self.sqlite_path} to {self.lancedb_path}[/blue]"
            )

            # Check if SQLite database exists
            if not self.sqlite_path.exists():
                self.console.print(
                    f"[red]SQLite database not found: {self.sqlite_path}[/red]"
                )
                return False

            sqlite_conn = sqlite3.connect(self.sqlite_path)
            try:
                # Rows are accessed by column name in the helpers below.
                sqlite_conn.row_factory = sqlite3.Row

                # Create the LanceDB store without the config validation step.
                lance_store = Store(self.lancedb_path, skip_validation=True)
                try:
                    with Progress() as progress:
                        doc_task = progress.add_task(
                            "[green]Migrating documents...", total=None
                        )
                        document_id_mapping = self._migrate_documents(
                            sqlite_conn, lance_store, progress, doc_task
                        )

                        chunk_task = progress.add_task(
                            "[yellow]Migrating chunks and embeddings...", total=None
                        )
                        self._migrate_chunks(
                            sqlite_conn,
                            lance_store,
                            progress,
                            chunk_task,
                            document_id_mapping,
                        )

                        settings_task = progress.add_task(
                            "[blue]Migrating settings...", total=None
                        )
                        self._migrate_settings(
                            sqlite_conn, lance_store, progress, settings_task
                        )

                    # Optimization is best-effort: a failure is only reported
                    # as a warning and does not fail the migration.
                    self.console.print("[blue]Optimizing LanceDB...[/blue]")
                    try:
                        lance_store.chunks_table.optimize()
                        self.console.print("[green]✅ Optimization completed[/green]")
                    except Exception as e:
                        self.console.print(
                            f"[yellow]Warning: Optimization failed: {e}[/yellow]"
                        )
                finally:
                    # Close the store even when a migration step raised.
                    lance_store.close()
            finally:
                # Always release the SQLite connection, including on failure.
                sqlite_conn.close()

            self.console.print("[green]✅ Migration completed successfully![/green]")
            self.console.print(
                f"[green]✅ Migrated {len(document_id_mapping)} documents[/green]"
            )
            return True

        except Exception as e:
            self.console.print(f"[red]❌ Migration failed: {e}[/red]")
            import traceback

            self.console.print(f"[red]{traceback.format_exc()}[/red]")
            return False

    def _migrate_documents(
        self,
        sqlite_conn: sqlite3.Connection,
        lance_store: Store,
        progress: Progress,
        task: TaskID,
    ) -> dict[int, str]:
        """Copy all documents to LanceDB.

        Returns:
            Mapping from each old integer document id to its new UUID string.
        """
        from haiku.rag.store.engine import DocumentRecord

        cursor = sqlite_conn.cursor()
        cursor.execute(
            "SELECT id, content, uri, metadata, created_at, updated_at FROM documents ORDER BY id"
        )

        id_mapping: dict[int, str] = {}  # Maps old integer ID to new UUID
        doc_records = []

        for row in cursor.fetchall():
            new_uuid = str(uuid4())
            id_mapping[row["id"]] = new_uuid

            # Round-trip the metadata through json so empty/NULL values become
            # "{}" and malformed JSON fails here rather than later.
            metadata = json.loads(row["metadata"]) if row["metadata"] else {}
            doc_records.append(
                DocumentRecord(
                    id=new_uuid,
                    content=row["content"],
                    uri=row["uri"],
                    metadata=json.dumps(metadata),
                    # The record fields are plain strings; map NULL timestamps
                    # to the model's own default of "".
                    created_at=row["created_at"] or "",
                    updated_at=row["updated_at"] or "",
                )
            )

        # Batch insert documents to LanceDB
        if doc_records:
            lance_store.documents_table.add(doc_records)

        progress.update(task, completed=len(doc_records), total=len(doc_records))
        return id_mapping

    def _load_embedding_blobs(self, cursor: sqlite3.Cursor) -> dict:
        """Read raw embedding blobs keyed by the ``chunk_id`` column.

        Returns an empty mapping (after printing a warning) when the query
        fails with ``sqlite3.OperationalError``.
        """
        embeddings_map: dict = {}
        try:
            # NOTE(review): this reads sqlite-vec's internal shadow tables
            # directly. Confirm that `chunk_id` here is the text chunk's id and
            # that each `vectors` blob holds exactly one embedding in the
            # source schema.
            cursor.execute("""
                SELECT
                    r.chunk_id,
                    v.vectors
                FROM chunk_embeddings_rowids r
                JOIN chunk_embeddings_vector_chunks00 v ON r.rowid = v.rowid
            """)

            for chunk_id, vectors_blob in cursor.fetchall():
                # Keep the first non-empty blob seen for each chunk id.
                if vectors_blob and chunk_id not in embeddings_map:
                    embeddings_map[chunk_id] = vectors_blob

        except sqlite3.OperationalError as e:
            self.console.print(
                f"[yellow]Warning: Could not extract embeddings: {e}[/yellow]"
            )
            self.console.print(
                "[yellow]Continuing migration without embeddings...[/yellow]"
            )
        return embeddings_map

    def _decode_embedding(self, blob, chunk_id, vector_dim: int) -> list[float]:
        """Decode one blob, falling back to a zero vector of ``vector_dim``."""
        if blob:
            try:
                return deserialize_sqlite_embedding(blob)
            except Exception as e:
                self.console.print(
                    f"[yellow]Warning: Failed to deserialize embedding for chunk {chunk_id}: {e}[/yellow]"
                )
        # No embedding found (or it could not be decoded): use a zero vector.
        return [0.0] * vector_dim

    def _migrate_chunks(
        self,
        sqlite_conn: sqlite3.Connection,
        lance_store: Store,
        progress: Progress,
        task: TaskID,
        document_id_mapping: dict[int, str],
    ):
        """Copy chunks and their embeddings to LanceDB.

        Chunks whose ``document_id`` is not present in ``document_id_mapping``
        are skipped with a warning. Chunks without a usable embedding receive
        a zero vector of the store embedder's dimension.
        """
        cursor = sqlite_conn.cursor()

        # Get chunks first
        cursor.execute("""
            SELECT id, document_id, content, metadata
            FROM chunks
            ORDER BY id
        """)
        chunks_data = cursor.fetchall()

        # Get embeddings separately to avoid vec0 virtual table issues
        embeddings_map = self._load_embedding_blobs(cursor)

        vector_dim = lance_store.embedder._vector_dim
        chunk_records = []
        for row in chunks_data:
            # Map the old document_id to new UUID
            document_uuid = document_id_mapping.get(row["document_id"])
            if not document_uuid:
                self.console.print(
                    f"[yellow]Warning: Document ID {row['document_id']} not found in mapping for chunk {row['id']}[/yellow]"
                )
                continue

            embedding = self._decode_embedding(
                embeddings_map.get(row["id"]), row["id"], vector_dim
            )
            metadata = json.loads(row["metadata"]) if row["metadata"] else {}
            chunk_records.append(
                lance_store.ChunkRecord(
                    id=str(uuid4()),  # Generate new UUID for chunk
                    document_id=document_uuid,
                    content=row["content"],
                    metadata=json.dumps(metadata),
                    vector=embedding,
                )
            )

        # Batch insert chunks to LanceDB
        if chunk_records:
            lance_store.chunks_table.add(chunk_records)

        progress.update(task, completed=len(chunk_records), total=len(chunk_records))

    def _migrate_settings(
        self,
        sqlite_conn: sqlite3.Connection,
        lance_store: Store,
        progress: Progress,
        task: TaskID,
    ):
        """Copy the settings row (id = 1) into the LanceDB settings record.

        When the SQLite query fails with ``sqlite3.OperationalError`` a notice
        is printed and nothing is written.
        """
        cursor = sqlite_conn.cursor()

        try:
            cursor.execute("SELECT id, settings FROM settings WHERE id = 1")
            row = cursor.fetchone()

            if row:
                settings_data = json.loads(row["settings"]) if row["settings"] else {}

                # Update the existing settings in LanceDB (use string ID)
                lance_store.settings_table.update(
                    where="id = 'settings'",
                    values={"settings": json.dumps(settings_data)},
                )

                progress.update(task, completed=1, total=1)
            else:
                progress.update(task, completed=0, total=0)

        except sqlite3.OperationalError:
            # Settings table doesn't exist in old SQLite database
            self.console.print(
                "[yellow]No settings table found in SQLite database[/yellow]"
            )
            progress.update(task, completed=0, total=0)
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
async def migrate_sqlite_to_lancedb(
    sqlite_path: Path, lancedb_path: Path | None = None
) -> bool:
    """
    Migrate an existing SQLite database to LanceDB.

    Args:
        sqlite_path: Location of the SQLite database to read from.
        lancedb_path: Destination for the LanceDB database. When omitted, a
            sibling of ``sqlite_path`` named ``<stem>.lancedb`` is used.

    Returns:
        True if the migration succeeded, False otherwise.
    """
    target_path = lancedb_path
    if target_path is None:
        # Derive the destination next to the source file.
        target_path = sqlite_path.parent / (sqlite_path.stem + ".lancedb")

    return SQLiteToLanceDBMigrator(sqlite_path, target_path).migrate()
|
haiku/rag/reranking/__init__.py
CHANGED
haiku/rag/store/engine.py
CHANGED
|
@@ -1,171 +1,203 @@
|
|
|
1
|
-
import
|
|
2
|
-
import
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
3
|
from importlib import metadata
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from
|
|
5
|
+
from uuid import uuid4
|
|
6
6
|
|
|
7
|
-
import
|
|
8
|
-
from
|
|
9
|
-
from
|
|
7
|
+
import lancedb
|
|
8
|
+
from lancedb.pydantic import LanceModel, Vector
|
|
9
|
+
from pydantic import Field
|
|
10
10
|
|
|
11
11
|
from haiku.rag.config import Config
|
|
12
12
|
from haiku.rag.embeddings import get_embedder
|
|
13
|
-
|
|
14
|
-
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class DocumentRecord(LanceModel):
    """Row model for the ``documents`` table."""

    # Generated per instance when no id is supplied.
    id: str = Field(default_factory=lambda: str(uuid4()))
    content: str
    uri: str | None = None
    # JSON-encoded text; an empty JSON object by default.
    metadata: str = "{}"
    # Timestamps are kept as plain strings and default to "".
    created_at: str = ""
    updated_at: str = ""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def create_chunk_model(vector_dim: int):
    """Build and return a ``ChunkRecord`` model class for ``vector_dim``.

    The vector column is typed as ``Vector(vector_dim)``, so the dimension is
    fixed when the class is created; the default vector is all zeros.
    """

    def _zero_vector() -> list[float]:
        # A fresh list per record, sized to the requested dimension.
        return [0.0] * vector_dim

    class ChunkRecord(LanceModel):
        id: str = Field(default_factory=lambda: str(uuid4()))
        document_id: str
        content: str
        metadata: str = "{}"
        vector: Vector(vector_dim) = Field(default_factory=_zero_vector)  # type: ignore

    return ChunkRecord
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class SettingsRecord(LanceModel):
    """Row model for the ``settings`` table."""

    # Defaults to the fixed key "settings".
    id: str = "settings"
    # JSON-encoded settings payload; an empty JSON object by default.
    settings: str = "{}"
|
|
15
45
|
|
|
16
46
|
|
|
17
47
|
class Store:
    """LanceDB-backed store holding the documents, chunks and settings tables."""

    def __init__(self, db_path: Path, skip_validation: bool = False):
        """Connect to LanceDB and make sure all tables exist.

        Args:
            db_path: Path used for a local connection (ignored by the
                connection step when cloud configuration is complete).
            skip_validation: When True, the configuration compatibility check
                is not run after the tables are set up.
        """
        self.db_path: Path = db_path
        self.embedder = get_embedder()

        # Create the ChunkRecord model with the correct vector dimension
        self.ChunkRecord = create_chunk_model(self.embedder._vector_dim)

        # Connect to LanceDB
        self.db = self._connect_to_lancedb(db_path)

        # Initialize tables
        self.create_or_update_db()

        # Validate config compatibility after connection is established
        if not skip_validation:
            self._validate_configuration()

    def _connect_to_lancedb(self, db_path: Path):
        """Establish connection to LanceDB (local, cloud, or object storage)."""
        if self._has_cloud_config():
            return lancedb.connect(
                uri=Config.LANCEDB_URI,
                api_key=Config.LANCEDB_API_KEY,
                region=Config.LANCEDB_REGION,
            )
        # Local file system connection
        return lancedb.connect(db_path)

    def _has_cloud_config(self) -> bool:
        """Return True when URI, API key and region are all set in Config."""
        return bool(
            Config.LANCEDB_URI and Config.LANCEDB_API_KEY and Config.LANCEDB_REGION
        )

    def _validate_configuration(self) -> None:
        """Validate that the configuration is compatible with the database."""
        # Imported here rather than at module level.
        from haiku.rag.store.repositories.settings import SettingsRepository

        settings_repo = SettingsRepository(self)
        settings_repo.validate_config_compatibility()

    def create_or_update_db(self):
        """Open the documents, chunks and settings tables, creating any that are missing.

        A newly created chunks table gets a full-text index on ``content``; a
        newly created settings table is seeded with the current Config. The
        package version is then recorded in the settings record.
        """
        existing_tables = self.db.table_names()

        # Create or get documents table
        if "documents" in existing_tables:
            self.documents_table = self.db.open_table("documents")
        else:
            self.documents_table = self.db.create_table(
                "documents", schema=DocumentRecord
            )

        # Create or get chunks table
        if "chunks" in existing_tables:
            self.chunks_table = self.db.open_table("chunks")
        else:
            self.chunks_table = self.db.create_table("chunks", schema=self.ChunkRecord)
            # Create FTS index on the new table
            self.chunks_table.create_fts_index("content", replace=True)

        # Create or get settings table
        if "settings" in existing_tables:
            self.settings_table = self.db.open_table("settings")
        else:
            self.settings_table = self.db.create_table(
                "settings", schema=SettingsRecord
            )
            # Save current settings to the new database
            settings_data = Config.model_dump(mode="json")
            self.settings_table.add(
                [SettingsRecord(id="settings", settings=json.dumps(settings_data))]
            )

        # Read the stored version BEFORE overwriting it, so that future
        # upgrade logic can compare it with the running package version.
        try:
            db_version = self.get_haiku_version()  # noqa: F841
            # TODO: Add upgrade logic here similar to SQLite version when needed
        except Exception as exc:
            # Best-effort: an unreadable settings record must not block startup.
            logger.debug("Could not read stored haiku.rag version: %s", exc)

        # Set current version in settings
        current_version = metadata.version("haiku.rag")
        self.set_haiku_version(current_version)

    def _read_settings(self) -> dict | None:
        """Return the parsed first settings record, or None when there is none."""
        settings_records = list(
            self.settings_table.search().limit(1).to_pydantic(SettingsRecord)
        )
        if not settings_records:
            return None
        raw = settings_records[0].settings
        return json.loads(raw) if raw else {}

    def get_haiku_version(self) -> str:
        """Return the version stored in settings, or "0.0.0" when absent."""
        settings = self._read_settings()
        if settings is None:
            return "0.0.0"
        return settings.get("version", "0.0.0")

    def set_haiku_version(self, version: str) -> None:
        """Store ``version`` in the settings record, creating the record if needed."""
        settings = self._read_settings()
        if settings is not None:
            settings["version"] = version
            # Update the record
            self.settings_table.update(
                where="id = 'settings'", values={"settings": json.dumps(settings)}
            )
        else:
            # Create new settings record
            settings_data = Config.model_dump(mode="json")
            settings_data["version"] = version
            self.settings_table.add(
                [SettingsRecord(id="settings", settings=json.dumps(settings_data))]
            )

    def recreate_embeddings_table(self) -> None:
        """Drop and recreate the chunks table with the embedder's vector dimension."""
        try:
            self.db.drop_table("chunks")
        except Exception as exc:
            # Dropping is best-effort; the table is recreated below either way.
            logger.debug("Could not drop chunks table before recreation: %s", exc)

        # Update the ChunkRecord model with new vector dimension
        self.ChunkRecord = create_chunk_model(self.embedder._vector_dim)
        self.chunks_table = self.db.create_table("chunks", schema=self.ChunkRecord)

        # Create FTS index on the new table
        self.chunks_table.create_fts_index("content", replace=True)

    def close(self):
        """Close the database connection (currently a no-op)."""
        # LanceDB connections are automatically managed
        pass

    @property
    def _connection(self):
        """Compatibility property for repositories expecting _connection."""
        return self
|
haiku/rag/store/models/chunk.py
CHANGED
|
@@ -6,8 +6,8 @@ class Chunk(BaseModel):
|
|
|
6
6
|
Represents a chunk with content, metadata, and optional document information.
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
-
id:
|
|
10
|
-
document_id:
|
|
9
|
+
id: str | None = None
|
|
10
|
+
document_id: str | None = None
|
|
11
11
|
content: str
|
|
12
12
|
metadata: dict = {}
|
|
13
13
|
document_uri: str | None = None
|