haiku.rag 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of haiku.rag might be problematic.
- haiku/rag/app.py +4 -4
- haiku/rag/cli.py +38 -27
- haiku/rag/client.py +19 -23
- haiku/rag/config.py +6 -2
- haiku/rag/logging.py +4 -0
- haiku/rag/mcp.py +12 -9
- haiku/rag/migration.py +316 -0
- haiku/rag/reranking/__init__.py +0 -6
- haiku/rag/store/engine.py +173 -141
- haiku/rag/store/models/chunk.py +2 -2
- haiku/rag/store/models/document.py +1 -1
- haiku/rag/store/repositories/__init__.py +6 -2
- haiku/rag/store/repositories/chunk.py +279 -414
- haiku/rag/store/repositories/document.py +171 -205
- haiku/rag/store/repositories/settings.py +115 -49
- haiku/rag/store/upgrades/__init__.py +1 -3
- haiku/rag/utils.py +39 -31
- {haiku_rag-0.6.0.dist-info → haiku_rag-0.7.0.dist-info}/METADATA +21 -16
- haiku_rag-0.7.0.dist-info/RECORD +39 -0
- haiku/rag/reranking/ollama.py +0 -81
- haiku/rag/store/repositories/base.py +0 -40
- haiku/rag/store/upgrades/v0_3_4.py +0 -26
- haiku_rag-0.6.0.dist-info/RECORD +0 -41
- {haiku_rag-0.6.0.dist-info → haiku_rag-0.7.0.dist-info}/WHEEL +0 -0
- {haiku_rag-0.6.0.dist-info → haiku_rag-0.7.0.dist-info}/entry_points.txt +0 -0
- {haiku_rag-0.6.0.dist-info → haiku_rag-0.7.0.dist-info}/licenses/LICENSE +0 -0
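The biggest rewrite in this release is haiku/rag/store/repositories/chunk.py, reproduced below: ChunkRepository drops its raw sqlite-vec/FTS5 SQL and operates on LanceDB tables instead. The rewritten methods build self.store.ChunkRecord and DocumentRecord rows that are defined in haiku/rag/store/engine.py, which is not part of this hunk. For orientation only, a schema of that shape using LanceDB's Pydantic integration might look like the sketch below; the table name, database URI, and vector dimension are assumptions, not values taken from the package.

```python
# Orientation sketch only: the real ChunkRecord/DocumentRecord live in
# haiku/rag/store/engine.py and are not shown in this diff. The field names
# mirror what the diff passes to ChunkRecord(...); the table name, database
# URI, and vector dimension below are assumptions.
import lancedb
from lancedb.pydantic import LanceModel, Vector


class ChunkRecord(LanceModel):
    id: str
    document_id: str
    content: str
    metadata: str         # JSON-encoded dict; the repository json.dumps()-es it
    vector: Vector(1024)  # embedding size depends on the configured embedder


db = lancedb.connect("./haiku.rag.lancedb")  # local path; LanceDB Cloud uses db:// URIs
chunks_table = db.create_table("chunks", schema=ChunkRecord, exist_ok=True)
chunks_table.create_fts_index("content", replace=True)
```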
haiku/rag/store/repositories/chunk.py

```diff
@@ -1,516 +1,381 @@
+import asyncio
 import json
-import
+import logging
+from uuid import uuid4

 from docling_core.types.doc.document import DoclingDocument
+from lancedb.rerankers import RRFReranker

 from haiku.rag.chunker import chunker
+from haiku.rag.config import Config
 from haiku.rag.embeddings import get_embedder
+from haiku.rag.store.engine import DocumentRecord, Store
 from haiku.rag.store.models.chunk import Chunk
-from haiku.rag.store.repositories.base import BaseRepository

+logger = logging.getLogger(__name__)

-class ChunkRepository(BaseRepository[Chunk]):
-    """Repository for Chunk database operations."""

-
-
-        self.embedder = get_embedder()
+class ChunkRepository:
+    """Repository for Chunk operations."""

-
+    def __init__(self, store: Store) -> None:
+        self.store = store
+        self.embedder = get_embedder()
+        self._optimize_lock = asyncio.Lock()
+
+    def _ensure_fts_index(self) -> None:
+        """Ensure FTS index exists on the content column."""
+        try:
+            self.store.chunks_table.create_fts_index("content", replace=True)
+        except Exception as e:
+            # Log the error but don't fail - FTS might already exist
+            logger.debug(f"FTS index creation skipped: {e}")
+
+    async def _optimize(self) -> None:
+        """Optimize the chunks table to refresh indexes."""
+        # Skip optimization for LanceDB Cloud as it handles this automatically
+        if Config.LANCEDB_URI and Config.LANCEDB_URI.startswith("db://"):
+            return
+
+        async with self._optimize_lock:
+            try:
+                self.store.chunks_table.optimize()
+            except (RuntimeError, OSError) as e:
+                # Handle "too many open files" and other resource errors gracefully
+                logger.debug(
+                    f"Table optimization skipped due to resource constraints: {e}"
+                )
+
+    async def create(self, entity: Chunk) -> Chunk:
         """Create a chunk in the database."""
-
-            raise ValueError("Store connection is not available")
-        if entity.document_id is None:
-            raise ValueError("Chunk must have a document_id to be created")
-
-        cursor = self.store._connection.cursor()
-        cursor.execute(
-            """
-            INSERT INTO chunks (document_id, content, metadata)
-            VALUES (:document_id, :content, :metadata)
-            """,
-            {
-                "document_id": entity.document_id,
-                "content": entity.content,
-                "metadata": json.dumps(entity.metadata),
-            },
-        )
+        assert entity.document_id, "Chunk must have a document_id to be created"

-
+        chunk_id = str(uuid4())

-        # Generate
+        # Generate embedding if not provided
         if entity.embedding is not None:
-
-            serialized_embedding = self.store.serialize_embedding(entity.embedding)
+            embedding = entity.embedding
         else:
-            # Generate embedding from content
             embedding = await self.embedder.embed(entity.content)
-
-
-
-
-
-
-            """,
-            {"chunk_id": entity.id, "embedding": serialized_embedding},
+        chunk_record = self.store.ChunkRecord(
+            id=chunk_id,
+            document_id=entity.document_id,
+            content=entity.content,
+            metadata=json.dumps(entity.metadata),
+            vector=embedding,
         )

-
-
-
-
-
-
-
-        )
+        self.store.chunks_table.add([chunk_record])
+
+        entity.id = chunk_id
+
+        # Try to optimize if not currently locked (non-blocking)
+        if not self._optimize_lock.locked():
+            asyncio.create_task(self._optimize())

-        if commit:
-            self.store._connection.commit()
         return entity

-    async def get_by_id(self, entity_id:
+    async def get_by_id(self, entity_id: str) -> Chunk | None:
         """Get a chunk by its ID."""
-
-
-
-
-
-            """
-            SELECT id, document_id, content, metadata
-            FROM chunks WHERE id = :id
-            """,
-            {"id": entity_id},
+        results = list(
+            self.store.chunks_table.search()
+            .where(f"id = '{entity_id}'")
+            .limit(1)
+            .to_pydantic(self.store.ChunkRecord)
         )

-
-        if row is None:
+        if not results:
             return None

-
-        metadata = json.loads(metadata_json) if metadata_json else {}
-
+        chunk_record = results[0]
         return Chunk(
-            id=
+            id=chunk_record.id,
+            document_id=chunk_record.document_id,
+            content=chunk_record.content,
+            metadata=json.loads(chunk_record.metadata) if chunk_record.metadata else {},
         )

     async def update(self, entity: Chunk) -> Chunk:
         """Update an existing chunk."""
-
-
-
-
-
-
-
-            """
-            UPDATE chunks
-            SET document_id = :document_id, content = :content, metadata = :metadata
-            WHERE id = :id
-            """,
-            {
+        assert entity.id, "Chunk ID is required for update"
+
+        embedding = await self.embedder.embed(entity.content)
+
+        self.store.chunks_table.update(
+            where=f"id = '{entity.id}'",
+            values={
                 "document_id": entity.document_id,
                 "content": entity.content,
                 "metadata": json.dumps(entity.metadata),
-                "
+                "vector": embedding,
             },
         )
+        # Try to optimize if not currently locked (non-blocking)
+        if not self._optimize_lock.locked():
+            asyncio.create_task(self._optimize())

-        # Regenerate and update embedding
-        embedding = await self.embedder.embed(entity.content)
-        serialized_embedding = self.store.serialize_embedding(embedding)
-        cursor.execute(
-            """
-            UPDATE chunk_embeddings
-            SET embedding = :embedding
-            WHERE chunk_id = :chunk_id
-            """,
-            {"embedding": serialized_embedding, "chunk_id": entity.id},
-        )
-
-        # Update FTS5 table
-        cursor.execute(
-            """
-            UPDATE chunks_fts
-            SET content = :content
-            WHERE rowid = :rowid
-            """,
-            {"content": entity.content, "rowid": entity.id},
-        )
-
-        self.store._connection.commit()
         return entity

-    async def delete(self, entity_id:
+    async def delete(self, entity_id: str) -> bool:
         """Delete a chunk by its ID."""
-
-
+        chunk = await self.get_by_id(entity_id)
+        if chunk is None:
+            return False

-
-
-        # Delete from FTS5 table first
-        cursor.execute(
-            "DELETE FROM chunks_fts WHERE rowid = :rowid", {"rowid": entity_id}
-        )
-
-        # Delete the embedding
-        cursor.execute(
-            "DELETE FROM chunk_embeddings WHERE chunk_id = :chunk_id",
-            {"chunk_id": entity_id},
-        )
-
-        # Delete the chunk
-        cursor.execute("DELETE FROM chunks WHERE id = :id", {"id": entity_id})
-
-        deleted = cursor.rowcount > 0
-        if commit:
-            self.store._connection.commit()
-        return deleted
+        self.store.chunks_table.delete(f"id = '{entity_id}'")
+        return True

     async def list_all(
         self, limit: int | None = None, offset: int | None = None
     ) -> list[Chunk]:
         """List all chunks with optional pagination."""
-
-            raise ValueError("Store connection is not available")
-
-        cursor = self.store._connection.cursor()
-        query = "SELECT id, document_id, content, metadata FROM chunks ORDER BY document_id, id"
-        params = {}
-
-        if limit is not None:
-            query += " LIMIT :limit"
-            params["limit"] = limit
+        query = self.store.chunks_table.search()

         if offset is not None:
-            query
-
+            query = query.offset(offset)
+        if limit is not None:
+            query = query.limit(limit)

-
-        rows = cursor.fetchall()
+        results = list(query.to_pydantic(self.store.ChunkRecord))

         return [
             Chunk(
-                id=
-                document_id=document_id,
-                content=content,
-                metadata=json.loads(
+                id=chunk.id,
+                document_id=chunk.document_id,
+                content=chunk.content,
+                metadata=json.loads(chunk.metadata) if chunk.metadata else {},
             )
-            for
+            for chunk in results
         ]

     async def create_chunks_for_document(
-        self, document_id:
+        self, document_id: str, document: DoclingDocument
     ) -> list[Chunk]:
         """Create chunks and embeddings for a document from DoclingDocument."""
-        # Chunk the document content
         chunk_texts = await chunker.chunk(document)
+
+        # Generate embeddings in parallel for all chunks
+        embeddings_tasks = []
+        for chunk_text in chunk_texts:
+            embeddings_tasks.append(self.embedder.embed(chunk_text))
+
+        # Wait for all embeddings to complete
+        embeddings = await asyncio.gather(*embeddings_tasks)
+
+        # Prepare all chunk records for batch insertion
+        chunk_records = []
         created_chunks = []

-
-
-
+        for order, (chunk_text, embedding) in enumerate(zip(chunk_texts, embeddings)):
+            chunk_id = str(uuid4())
+
+            chunk_record = self.store.ChunkRecord(
+                id=chunk_id,
+                document_id=document_id,
+                content=chunk_text,
+                metadata=json.dumps({"order": order}),
+                vector=embedding,
+            )
+            chunk_records.append(chunk_record)
+
             chunk = Chunk(
-
+                id=chunk_id,
+                document_id=document_id,
+                content=chunk_text,
+                metadata={"order": order},
             )
+            created_chunks.append(chunk)

-
-
+        # Batch insert all chunks at once
+        if chunk_records:
+            self.store.chunks_table.add(chunk_records)

+        # Force optimization once at the end for bulk operations
+        await self._optimize()
         return created_chunks

-    async def delete_all(self
+    async def delete_all(self) -> None:
         """Delete all chunks from the database."""
-
-
-
-
-
-
-
-        cursor.execute("DELETE FROM chunks")
-
-        deleted = cursor.rowcount > 0
-        if commit:
-            self.store._connection.commit()
-        return deleted
+        # Drop and recreate table to clear all data
+        self.store.db.drop_table("chunks")
+        self.store.chunks_table = self.store.db.create_table(
+            "chunks", schema=self.store.ChunkRecord
+        )
+        # Create FTS index on the new table
+        self.store.chunks_table.create_fts_index("content", replace=True)

-    async def delete_by_document_id(
-        self, document_id: int, commit: bool = True
-    ) -> bool:
+    async def delete_by_document_id(self, document_id: str) -> bool:
         """Delete all chunks for a document."""
         chunks = await self.get_by_document_id(document_id)

-
-
-            if chunk.id is not None:
-                deleted = await self.delete(chunk.id, commit=False)
-                deleted_any = deleted_any or deleted
+        if not chunks:
+            return False

-
-
-        return deleted_any
+        self.store.chunks_table.delete(f"document_id = '{document_id}'")
+        return True

-    async def
-        self, query: str, limit: int = 5
+    async def search(
+        self, query: str, limit: int = 5, search_type: str = "hybrid"
     ) -> list[tuple[Chunk, float]]:
-        """Search for relevant chunks using
-        if self.store._connection is None:
-            raise ValueError("Store connection is not available")
-
-        cursor = self.store._connection.cursor()
-
-        # Generate embedding for the query
-        query_embedding = await self.embedder.embed(query)
-        serialized_query_embedding = self.store.serialize_embedding(query_embedding)
-
-        # Search for similar chunks using sqlite-vec
-        cursor.execute(
-            """
-            SELECT c.id, c.document_id, c.content, c.metadata, distance, d.uri, d.metadata as document_metadata
-            FROM chunk_embeddings
-            JOIN chunks c ON c.id = chunk_embeddings.chunk_id
-            JOIN documents d ON c.document_id = d.id
-            WHERE embedding MATCH :embedding AND k = :k
-            ORDER BY distance
-            """,
-            {"embedding": serialized_query_embedding, "k": limit},
-        )
+        """Search for relevant chunks using the specified search method.

-
-
-
-
-                    id=chunk_id,
-                    document_id=document_id,
-                    content=content,
-                    metadata=json.loads(metadata_json) if metadata_json else {},
-                    document_uri=document_uri,
-                    document_meta=json.loads(document_metadata_json)
-                    if document_metadata_json
-                    else {},
-                ),
-                1.0 / (1.0 + distance),
-            )
-            for chunk_id, document_id, content, metadata_json, distance, document_uri, document_metadata_json in results
-        ]
+        Args:
+            query: The search query string.
+            limit: Maximum number of results to return.
+            search_type: Type of search - "vector", "fts", or "hybrid" (default).

-
-
-
-
-
-            raise ValueError("Store connection is not available")
-
-        cursor = self.store._connection.cursor()
-
-        # Clean the query for FTS5 - extract keywords for better matching
-        # Remove special characters and split into words
-        words = re.findall(r"\b\w+\b", query.lower())
-        # Join with OR to find chunks containing any of the keywords
-        fts_query = " OR ".join(words) if words else query
-
-        # Search using FTS5
-        cursor.execute(
-            """
-            SELECT c.id, c.document_id, c.content, c.metadata, rank, d.uri, d.metadata as document_metadata
-            FROM chunks_fts
-            JOIN chunks c ON c.id = chunks_fts.rowid
-            JOIN documents d ON c.document_id = d.id
-            WHERE chunks_fts MATCH :query
-            ORDER BY rank
-            LIMIT :limit
-            """,
-            {"query": fts_query, "limit": limit},
-        )
+        Returns:
+            List of (chunk, score) tuples ordered by relevance.
+        """
+        if not query.strip():
+            return []

-
+        if search_type == "vector":
+            query_embedding = await self.embedder.embed(query)

-
-
-
-                    id=chunk_id,
-                    document_id=document_id,
-                    content=content,
-                    metadata=json.loads(metadata_json) if metadata_json else {},
-                    document_uri=document_uri,
-                    document_meta=json.loads(document_metadata_json)
-                    if document_metadata_json
-                    else {},
-                ),
-                -rank,
-            )
-            for chunk_id, document_id, content, metadata_json, rank, document_uri, document_metadata_json in results
-            # FTS5 rank is negative BM25 score
-        ]
+            results = self.store.chunks_table.search(
+                query_embedding, query_type="vector", vector_column_name="vector"
+            ).limit(limit)

-
-
-
-
-
-            raise ValueError("Store connection is not available")
-
-        cursor = self.store._connection.cursor()
-
-        # Generate embedding for the query
-        query_embedding = await self.embedder.embed(query)
-        serialized_query_embedding = self.store.serialize_embedding(query_embedding)
-
-        # Clean the query for FTS5 - extract keywords for better matching
-        # Remove special characters and split into words
-        words = re.findall(r"\b\w+\b", query.lower())
-        # Join with OR to find chunks containing any of the keywords
-        fts_query = " OR ".join(words) if words else query
-        # Perform hybrid search using RRF (Reciprocal Rank Fusion)
-        cursor.execute(
-            """
-            WITH vector_search AS (
-                SELECT
-                    c.id,
-                    c.document_id,
-                    c.content,
-                    c.metadata,
-                    ROW_NUMBER() OVER (ORDER BY ce.distance) as vector_rank
-                FROM chunk_embeddings ce
-                JOIN chunks c ON c.id = ce.chunk_id
-                WHERE ce.embedding MATCH :embedding AND k = :k_vector
-                ORDER BY ce.distance
-            ),
-            fts_search AS (
-                SELECT
-                    c.id,
-                    c.document_id,
-                    c.content,
-                    c.metadata,
-                    ROW_NUMBER() OVER (ORDER BY chunks_fts.rank) as fts_rank
-                FROM chunks_fts
-                JOIN chunks c ON c.id = chunks_fts.rowid
-                WHERE chunks_fts MATCH :fts_query
-                ORDER BY chunks_fts.rank
-            ),
-            all_chunks AS (
-                SELECT id, document_id, content, metadata FROM vector_search
-                UNION
-                SELECT id, document_id, content, metadata FROM fts_search
-            ),
-            rrf_scores AS (
-                SELECT
-                    a.id,
-                    a.document_id,
-                    a.content,
-                    a.metadata,
-                    COALESCE(1.0 / (:k + v.vector_rank), 0) + COALESCE(1.0 / (:k + f.fts_rank), 0) as rrf_score
-                FROM all_chunks a
-                LEFT JOIN vector_search v ON a.id = v.id
-                LEFT JOIN fts_search f ON a.id = f.id
+            return await self._process_search_results(results)
+
+        elif search_type == "fts":
+            results = self.store.chunks_table.search(query, query_type="fts").limit(
+                limit
             )
-
-            FROM rrf_scores r
-            JOIN documents d ON r.document_id = d.id
-            ORDER BY r.rrf_score DESC
-            LIMIT :limit
-            """,
-            {
-                "embedding": serialized_query_embedding,
-                "k_vector": limit * 3,
-                "fts_query": fts_query,
-                "k": k,
-                "limit": limit,
-            },
-        )
+            return await self._process_search_results(results)

-
-
-
-
-
-
-
-
-
-
-
-
-            )
-                rrf_score,
+        else:  # hybrid (default)
+            query_embedding = await self.embedder.embed(query)
+
+            # Create RRF reranker
+            reranker = RRFReranker()
+
+            # Perform native hybrid search with RRF reranking
+            results = (
+                self.store.chunks_table.search(query_type="hybrid")
+                .vector(query_embedding)
+                .text(query)
+                .rerank(reranker)
+                .limit(limit)
             )
-
-        ]
+            return await self._process_search_results(results)

-    async def get_by_document_id(self, document_id:
+    async def get_by_document_id(self, document_id: str) -> list[Chunk]:
         """Get all chunks for a specific document."""
-
-
-
-
-        cursor.execute(
-            """
-            SELECT c.id, c.document_id, c.content, c.metadata, d.uri, d.metadata as document_metadata
-            FROM chunks c
-            JOIN documents d ON c.document_id = d.id
-            WHERE c.document_id = :document_id
-            ORDER BY JSON_EXTRACT(c.metadata, '$.order')
-            """,
-            {"document_id": document_id},
+        results = list(
+            self.store.chunks_table.search()
+            .where(f"document_id = '{document_id}'")
+            .to_pydantic(self.store.ChunkRecord)
         )

-
-
+        # Get document info
+        doc_results = list(
+            self.store.documents_table.search()
+            .where(f"id = '{document_id}'")
+            .limit(1)
+            .to_pydantic(DocumentRecord)
+        )
+
+        doc_uri = doc_results[0].uri if doc_results else None
+        doc_meta = doc_results[0].metadata if doc_results else "{}"
+
+        # Sort by order in metadata
+        chunks = [
             Chunk(
-                id=
-                document_id=document_id,
-                content=content,
-                metadata=json.loads(
-                document_uri=
-                document_meta=json.loads(
-                if document_metadata_json
-                else {},
+                id=chunk.id,
+                document_id=chunk.document_id,
+                content=chunk.content,
+                metadata=json.loads(chunk.metadata) if chunk.metadata else {},
+                document_uri=doc_uri,
+                document_meta=json.loads(doc_meta) if doc_meta else {},
             )
-            for
+            for chunk in results
         ]

+        chunks.sort(key=lambda c: c.metadata.get("order", 0))
+        return chunks
+
     async def get_adjacent_chunks(self, chunk: Chunk, num_adjacent: int) -> list[Chunk]:
         """Get adjacent chunks before and after the given chunk within the same document."""
-
-            raise ValueError("Store connection is not available")
-        if chunk.document_id is None:
-            return []
+        assert chunk.document_id, "Document id is required for adjacent chunk finding"

-        cursor = self.store._connection.cursor()
         chunk_order = chunk.metadata.get("order")
         if chunk_order is None:
             return []

-        # Get
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        )
+        # Get all chunks for the document
+        all_chunks = await self.get_by_document_id(chunk.document_id)
+
+        # Filter to adjacent chunks
+        adjacent_chunks = []
+        for c in all_chunks:
+            c_order = c.metadata.get("order", 0)
+            if c.id != chunk.id and abs(c_order - chunk_order) <= num_adjacent:
+                adjacent_chunks.append(c)
+
+        return adjacent_chunks
+
+    async def _process_search_results(self, query_result) -> list[tuple[Chunk, float]]:
+        """Process search results into chunks with document info and scores."""
+        chunks_with_scores = []
+
+        # Get both arrow and pydantic results to access scores
+        arrow_result = query_result.to_arrow()
+        pydantic_results = list(query_result.to_pydantic(self.store.ChunkRecord))
+
+        # Extract scores from arrow result based on search type
+        scores = []
+        column_names = arrow_result.column_names
+
+        if "_distance" in column_names:
+            # Vector search - distance (lower is better, convert to similarity)
+            distances = arrow_result.column("_distance").to_pylist()
+            scores = [max(0.0, 1.0 / (1.0 + dist)) for dist in distances]
+        elif "_relevance_score" in column_names:
+            # Hybrid search - relevance score (higher is better)
+            scores = arrow_result.column("_relevance_score").to_pylist()
+        elif "_score" in column_names:
+            # FTS search - score (higher is better)
+            scores = arrow_result.column("_score").to_pylist()
+        else:
+            raise ValueError("Unknown search result format, cannot extract scores")
+
+        # Collect all unique document IDs for batch lookup
+        document_ids = list(set(chunk.document_id for chunk in pydantic_results))
+
+        # Batch fetch all documents at once
+        documents_map = {}
+        if document_ids:
+            # Create a WHERE clause for all document IDs
+            where_clause = " OR ".join(f"id = '{doc_id}'" for doc_id in document_ids)
+            doc_results = list(
+                self.store.documents_table.search()
+                .where(where_clause)
+                .to_pydantic(DocumentRecord)
+            )
+            documents_map = {doc.id: doc for doc in doc_results}

-
-
-
-
-
-
-
-
-
-
+        for i, chunk_record in enumerate(pydantic_results):
+            # Get document info from pre-fetched map
+            doc = documents_map.get(chunk_record.document_id)
+            doc_uri = doc.uri if doc else None
+            doc_meta = doc.metadata if doc else "{}"
+
+            chunk = Chunk(
+                id=chunk_record.id,
+                document_id=chunk_record.document_id,
+                content=chunk_record.content,
+                metadata=json.loads(chunk_record.metadata)
+                if chunk_record.metadata
                 else {},
+                document_uri=doc_uri,
+                document_meta=json.loads(doc_meta) if doc_meta else {},
             )
-
-
+
+            # Get score from arrow result
+            score = scores[i] if i < len(scores) else 1.0
+
+            chunks_with_scores.append((chunk, score))
+
+        return chunks_with_scores
```