haiku.rag 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -1,516 +1,381 @@
+import asyncio
 import json
-import re
+import logging
+from uuid import uuid4
 
 from docling_core.types.doc.document import DoclingDocument
+from lancedb.rerankers import RRFReranker
 
 from haiku.rag.chunker import chunker
+from haiku.rag.config import Config
 from haiku.rag.embeddings import get_embedder
+from haiku.rag.store.engine import DocumentRecord, Store
 from haiku.rag.store.models.chunk import Chunk
-from haiku.rag.store.repositories.base import BaseRepository
 
+logger = logging.getLogger(__name__)
 
-class ChunkRepository(BaseRepository[Chunk]):
-    """Repository for Chunk database operations."""
 
-    def __init__(self, store):
-        super().__init__(store)
-        self.embedder = get_embedder()
+class ChunkRepository:
+    """Repository for Chunk operations."""
 
-    async def create(self, entity: Chunk, commit: bool = True) -> Chunk:
+    def __init__(self, store: Store) -> None:
+        self.store = store
+        self.embedder = get_embedder()
+        self._optimize_lock = asyncio.Lock()
+
+    def _ensure_fts_index(self) -> None:
+        """Ensure FTS index exists on the content column."""
+        try:
+            self.store.chunks_table.create_fts_index("content", replace=True)
+        except Exception as e:
+            # Log the error but don't fail - FTS might already exist
+            logger.debug(f"FTS index creation skipped: {e}")
+
+    async def _optimize(self) -> None:
+        """Optimize the chunks table to refresh indexes."""
+        # Skip optimization for LanceDB Cloud as it handles this automatically
+        if Config.LANCEDB_URI and Config.LANCEDB_URI.startswith("db://"):
+            return
+
+        async with self._optimize_lock:
+            try:
+                self.store.chunks_table.optimize()
+            except (RuntimeError, OSError) as e:
+                # Handle "too many open files" and other resource errors gracefully
+                logger.debug(
+                    f"Table optimization skipped due to resource constraints: {e}"
+                )
+
+    async def create(self, entity: Chunk) -> Chunk:
         """Create a chunk in the database."""
-        if self.store._connection is None:
-            raise ValueError("Store connection is not available")
-        if entity.document_id is None:
-            raise ValueError("Chunk must have a document_id to be created")
-
-        cursor = self.store._connection.cursor()
-        cursor.execute(
-            """
-            INSERT INTO chunks (document_id, content, metadata)
-            VALUES (:document_id, :content, :metadata)
-            """,
-            {
-                "document_id": entity.document_id,
-                "content": entity.content,
-                "metadata": json.dumps(entity.metadata),
-            },
-        )
+        assert entity.document_id, "Chunk must have a document_id to be created"
 
-        entity.id = cursor.lastrowid
+        chunk_id = str(uuid4())
 
-        # Generate and store embedding - use existing one if provided
+        # Generate embedding if not provided
         if entity.embedding is not None:
-            # Use the provided embedding
-            serialized_embedding = self.store.serialize_embedding(entity.embedding)
+            embedding = entity.embedding
         else:
-            # Generate embedding from content
             embedding = await self.embedder.embed(entity.content)
-            serialized_embedding = self.store.serialize_embedding(embedding)
-
-        cursor.execute(
-            """
-            INSERT INTO chunk_embeddings (chunk_id, embedding)
-            VALUES (:chunk_id, :embedding)
-            """,
-            {"chunk_id": entity.id, "embedding": serialized_embedding},
+        chunk_record = self.store.ChunkRecord(
+            id=chunk_id,
+            document_id=entity.document_id,
+            content=entity.content,
+            metadata=json.dumps(entity.metadata),
+            vector=embedding,
         )
 
-        # Insert into FTS5 table for full-text search
-        cursor.execute(
-            """
-            INSERT INTO chunks_fts(rowid, content)
-            VALUES (:rowid, :content)
-            """,
-            {"rowid": entity.id, "content": entity.content},
-        )
+        self.store.chunks_table.add([chunk_record])
+
+        entity.id = chunk_id
+
+        # Try to optimize if not currently locked (non-blocking)
+        if not self._optimize_lock.locked():
+            asyncio.create_task(self._optimize())
 
-        if commit:
-            self.store._connection.commit()
         return entity
 
-    async def get_by_id(self, entity_id: int) -> Chunk | None:
+    async def get_by_id(self, entity_id: str) -> Chunk | None:
        """Get a chunk by its ID."""
-        if self.store._connection is None:
-            raise ValueError("Store connection is not available")
-
-        cursor = self.store._connection.cursor()
-        cursor.execute(
-            """
-            SELECT id, document_id, content, metadata
-            FROM chunks WHERE id = :id
-            """,
-            {"id": entity_id},
+        results = list(
+            self.store.chunks_table.search()
+            .where(f"id = '{entity_id}'")
+            .limit(1)
+            .to_pydantic(self.store.ChunkRecord)
         )
 
-        row = cursor.fetchone()
-        if row is None:
+        if not results:
             return None
 
-        chunk_id, document_id, content, metadata_json = row
-        metadata = json.loads(metadata_json) if metadata_json else {}
-
+        chunk_record = results[0]
         return Chunk(
-            id=chunk_id, document_id=document_id, content=content, metadata=metadata
+            id=chunk_record.id,
+            document_id=chunk_record.document_id,
+            content=chunk_record.content,
+            metadata=json.loads(chunk_record.metadata) if chunk_record.metadata else {},
         )
 
     async def update(self, entity: Chunk) -> Chunk:
         """Update an existing chunk."""
-        if self.store._connection is None:
-            raise ValueError("Store connection is not available")
-        if entity.id is None:
-            raise ValueError("Chunk ID is required for update")
-
-        cursor = self.store._connection.cursor()
-        cursor.execute(
-            """
-            UPDATE chunks
-            SET document_id = :document_id, content = :content, metadata = :metadata
-            WHERE id = :id
-            """,
-            {
+        assert entity.id, "Chunk ID is required for update"
+
+        embedding = await self.embedder.embed(entity.content)
+
+        self.store.chunks_table.update(
+            where=f"id = '{entity.id}'",
+            values={
                 "document_id": entity.document_id,
                 "content": entity.content,
                 "metadata": json.dumps(entity.metadata),
-                "id": entity.id,
+                "vector": embedding,
             },
         )
+        # Try to optimize if not currently locked (non-blocking)
+        if not self._optimize_lock.locked():
+            asyncio.create_task(self._optimize())
 
-        # Regenerate and update embedding
-        embedding = await self.embedder.embed(entity.content)
-        serialized_embedding = self.store.serialize_embedding(embedding)
-        cursor.execute(
-            """
-            UPDATE chunk_embeddings
-            SET embedding = :embedding
-            WHERE chunk_id = :chunk_id
-            """,
-            {"embedding": serialized_embedding, "chunk_id": entity.id},
-        )
-
-        # Update FTS5 table
-        cursor.execute(
-            """
-            UPDATE chunks_fts
-            SET content = :content
-            WHERE rowid = :rowid
-            """,
-            {"content": entity.content, "rowid": entity.id},
-        )
-
-        self.store._connection.commit()
         return entity
 
-    async def delete(self, entity_id: int, commit: bool = True) -> bool:
+    async def delete(self, entity_id: str) -> bool:
         """Delete a chunk by its ID."""
-        if self.store._connection is None:
-            raise ValueError("Store connection is not available")
+        chunk = await self.get_by_id(entity_id)
+        if chunk is None:
+            return False
 
-        cursor = self.store._connection.cursor()
-
-        # Delete from FTS5 table first
-        cursor.execute(
-            "DELETE FROM chunks_fts WHERE rowid = :rowid", {"rowid": entity_id}
-        )
-
-        # Delete the embedding
-        cursor.execute(
-            "DELETE FROM chunk_embeddings WHERE chunk_id = :chunk_id",
-            {"chunk_id": entity_id},
-        )
-
-        # Delete the chunk
-        cursor.execute("DELETE FROM chunks WHERE id = :id", {"id": entity_id})
-
-        deleted = cursor.rowcount > 0
-        if commit:
-            self.store._connection.commit()
-        return deleted
+        self.store.chunks_table.delete(f"id = '{entity_id}'")
+        return True
 
     async def list_all(
         self, limit: int | None = None, offset: int | None = None
     ) -> list[Chunk]:
         """List all chunks with optional pagination."""
-        if self.store._connection is None:
-            raise ValueError("Store connection is not available")
-
-        cursor = self.store._connection.cursor()
-        query = "SELECT id, document_id, content, metadata FROM chunks ORDER BY document_id, id"
-        params = {}
-
-        if limit is not None:
-            query += " LIMIT :limit"
-            params["limit"] = limit
+        query = self.store.chunks_table.search()
 
         if offset is not None:
-            query += " OFFSET :offset"
-            params["offset"] = offset
+            query = query.offset(offset)
+        if limit is not None:
+            query = query.limit(limit)
 
-        cursor.execute(query, params)
-        rows = cursor.fetchall()
+        results = list(query.to_pydantic(self.store.ChunkRecord))
 
         return [
             Chunk(
-                id=chunk_id,
-                document_id=document_id,
-                content=content,
-                metadata=json.loads(metadata_json) if metadata_json else {},
+                id=chunk.id,
+                document_id=chunk.document_id,
+                content=chunk.content,
+                metadata=json.loads(chunk.metadata) if chunk.metadata else {},
             )
-            for chunk_id, document_id, content, metadata_json in rows
+            for chunk in results
         ]
 
     async def create_chunks_for_document(
-        self, document_id: int, document: DoclingDocument, commit: bool = True
+        self, document_id: str, document: DoclingDocument
     ) -> list[Chunk]:
         """Create chunks and embeddings for a document from DoclingDocument."""
-        # Chunk the document content
         chunk_texts = await chunker.chunk(document)
+
+        # Generate embeddings in parallel for all chunks
+        embeddings_tasks = []
+        for chunk_text in chunk_texts:
+            embeddings_tasks.append(self.embedder.embed(chunk_text))
+
+        # Wait for all embeddings to complete
+        embeddings = await asyncio.gather(*embeddings_tasks)
+
+        # Prepare all chunk records for batch insertion
+        chunk_records = []
        created_chunks = []
 
-        # Create chunks with embeddings using the create method
-        for order, chunk_text in enumerate(chunk_texts):
-            # Create chunk with order in metadata
+        for order, (chunk_text, embedding) in enumerate(zip(chunk_texts, embeddings)):
+            chunk_id = str(uuid4())
+
+            chunk_record = self.store.ChunkRecord(
+                id=chunk_id,
+                document_id=document_id,
+                content=chunk_text,
+                metadata=json.dumps({"order": order}),
+                vector=embedding,
+            )
+            chunk_records.append(chunk_record)
+
             chunk = Chunk(
-                document_id=document_id, content=chunk_text, metadata={"order": order}
+                id=chunk_id,
+                document_id=document_id,
+                content=chunk_text,
+                metadata={"order": order},
             )
+            created_chunks.append(chunk)
 
-            created_chunk = await self.create(chunk, commit=commit)
-            created_chunks.append(created_chunk)
+        # Batch insert all chunks at once
+        if chunk_records:
+            self.store.chunks_table.add(chunk_records)
 
+        # Force optimization once at the end for bulk operations
+        await self._optimize()
         return created_chunks
 
-    async def delete_all(self, commit: bool = True) -> bool:
+    async def delete_all(self) -> None:
         """Delete all chunks from the database."""
-        if self.store._connection is None:
-            raise ValueError("Store connection is not available")
-
-        cursor = self.store._connection.cursor()
-
-        cursor.execute("DELETE FROM chunks_fts")
-        cursor.execute("DELETE FROM chunk_embeddings")
-        cursor.execute("DELETE FROM chunks")
-
-        deleted = cursor.rowcount > 0
-        if commit:
-            self.store._connection.commit()
-        return deleted
+        # Drop and recreate table to clear all data
+        self.store.db.drop_table("chunks")
+        self.store.chunks_table = self.store.db.create_table(
+            "chunks", schema=self.store.ChunkRecord
+        )
+        # Create FTS index on the new table
+        self.store.chunks_table.create_fts_index("content", replace=True)
 
-    async def delete_by_document_id(
-        self, document_id: int, commit: bool = True
-    ) -> bool:
+    async def delete_by_document_id(self, document_id: str) -> bool:
         """Delete all chunks for a document."""
         chunks = await self.get_by_document_id(document_id)
 
-        deleted_any = False
-        for chunk in chunks:
-            if chunk.id is not None:
-                deleted = await self.delete(chunk.id, commit=False)
-                deleted_any = deleted_any or deleted
+        if not chunks:
+            return False
 
-        if commit and deleted_any and self.store._connection:
-            self.store._connection.commit()
-        return deleted_any
+        self.store.chunks_table.delete(f"document_id = '{document_id}'")
+        return True
 
-    async def search_chunks(
-        self, query: str, limit: int = 5
+    async def search(
+        self, query: str, limit: int = 5, search_type: str = "hybrid"
     ) -> list[tuple[Chunk, float]]:
-        """Search for relevant chunks using vector similarity."""
-        if self.store._connection is None:
-            raise ValueError("Store connection is not available")
-
-        cursor = self.store._connection.cursor()
-
-        # Generate embedding for the query
-        query_embedding = await self.embedder.embed(query)
-        serialized_query_embedding = self.store.serialize_embedding(query_embedding)
-
-        # Search for similar chunks using sqlite-vec
-        cursor.execute(
-            """
-            SELECT c.id, c.document_id, c.content, c.metadata, distance, d.uri, d.metadata as document_metadata
-            FROM chunk_embeddings
-            JOIN chunks c ON c.id = chunk_embeddings.chunk_id
-            JOIN documents d ON c.document_id = d.id
-            WHERE embedding MATCH :embedding AND k = :k
-            ORDER BY distance
-            """,
-            {"embedding": serialized_query_embedding, "k": limit},
-        )
+        """Search for relevant chunks using the specified search method.
 
-        results = cursor.fetchall()
-        return [
-            (
-                Chunk(
-                    id=chunk_id,
-                    document_id=document_id,
-                    content=content,
-                    metadata=json.loads(metadata_json) if metadata_json else {},
-                    document_uri=document_uri,
-                    document_meta=json.loads(document_metadata_json)
-                    if document_metadata_json
-                    else {},
-                ),
-                1.0 / (1.0 + distance),
-            )
-            for chunk_id, document_id, content, metadata_json, distance, document_uri, document_metadata_json in results
-        ]
+        Args:
+            query: The search query string.
+            limit: Maximum number of results to return.
+            search_type: Type of search - "vector", "fts", or "hybrid" (default).
 
-    async def search_chunks_fts(
-        self, query: str, limit: int = 5
-    ) -> list[tuple[Chunk, float]]:
-        """Search for chunks using FTS5 full-text search."""
-        if self.store._connection is None:
-            raise ValueError("Store connection is not available")
-
-        cursor = self.store._connection.cursor()
-
-        # Clean the query for FTS5 - extract keywords for better matching
-        # Remove special characters and split into words
-        words = re.findall(r"\b\w+\b", query.lower())
-        # Join with OR to find chunks containing any of the keywords
-        fts_query = " OR ".join(words) if words else query
-
-        # Search using FTS5
-        cursor.execute(
-            """
-            SELECT c.id, c.document_id, c.content, c.metadata, rank, d.uri, d.metadata as document_metadata
-            FROM chunks_fts
-            JOIN chunks c ON c.id = chunks_fts.rowid
-            JOIN documents d ON c.document_id = d.id
-            WHERE chunks_fts MATCH :query
-            ORDER BY rank
-            LIMIT :limit
-            """,
-            {"query": fts_query, "limit": limit},
-        )
+        Returns:
+            List of (chunk, score) tuples ordered by relevance.
+        """
+        if not query.strip():
+            return []
 
-        results = cursor.fetchall()
+        if search_type == "vector":
+            query_embedding = await self.embedder.embed(query)
 
-        return [
-            (
-                Chunk(
-                    id=chunk_id,
-                    document_id=document_id,
-                    content=content,
-                    metadata=json.loads(metadata_json) if metadata_json else {},
-                    document_uri=document_uri,
-                    document_meta=json.loads(document_metadata_json)
-                    if document_metadata_json
-                    else {},
-                ),
-                -rank,
-            )
-            for chunk_id, document_id, content, metadata_json, rank, document_uri, document_metadata_json in results
-            # FTS5 rank is negative BM25 score
-        ]
+            results = self.store.chunks_table.search(
+                query_embedding, query_type="vector", vector_column_name="vector"
+            ).limit(limit)
 
-    async def search_chunks_hybrid(
-        self, query: str, limit: int = 5, k: int = 60
-    ) -> list[tuple[Chunk, float]]:
-        """Hybrid search using Reciprocal Rank Fusion (RRF) combining vector similarity and FTS5 full-text search."""
-        if self.store._connection is None:
-            raise ValueError("Store connection is not available")
-
-        cursor = self.store._connection.cursor()
-
-        # Generate embedding for the query
-        query_embedding = await self.embedder.embed(query)
-        serialized_query_embedding = self.store.serialize_embedding(query_embedding)
-
-        # Clean the query for FTS5 - extract keywords for better matching
-        # Remove special characters and split into words
-        words = re.findall(r"\b\w+\b", query.lower())
-        # Join with OR to find chunks containing any of the keywords
-        fts_query = " OR ".join(words) if words else query
-        # Perform hybrid search using RRF (Reciprocal Rank Fusion)
-        cursor.execute(
-            """
-            WITH vector_search AS (
-                SELECT
-                    c.id,
-                    c.document_id,
-                    c.content,
-                    c.metadata,
-                    ROW_NUMBER() OVER (ORDER BY ce.distance) as vector_rank
-                FROM chunk_embeddings ce
-                JOIN chunks c ON c.id = ce.chunk_id
-                WHERE ce.embedding MATCH :embedding AND k = :k_vector
-                ORDER BY ce.distance
-            ),
-            fts_search AS (
-                SELECT
-                    c.id,
-                    c.document_id,
-                    c.content,
-                    c.metadata,
-                    ROW_NUMBER() OVER (ORDER BY chunks_fts.rank) as fts_rank
-                FROM chunks_fts
-                JOIN chunks c ON c.id = chunks_fts.rowid
-                WHERE chunks_fts MATCH :fts_query
-                ORDER BY chunks_fts.rank
-            ),
-            all_chunks AS (
-                SELECT id, document_id, content, metadata FROM vector_search
-                UNION
-                SELECT id, document_id, content, metadata FROM fts_search
-            ),
-            rrf_scores AS (
-                SELECT
-                    a.id,
-                    a.document_id,
-                    a.content,
-                    a.metadata,
-                    COALESCE(1.0 / (:k + v.vector_rank), 0) + COALESCE(1.0 / (:k + f.fts_rank), 0) as rrf_score
-                FROM all_chunks a
-                LEFT JOIN vector_search v ON a.id = v.id
-                LEFT JOIN fts_search f ON a.id = f.id
+            return await self._process_search_results(results)
+
+        elif search_type == "fts":
+            results = self.store.chunks_table.search(query, query_type="fts").limit(
+                limit
             )
-            SELECT r.id, r.document_id, r.content, r.metadata, r.rrf_score, d.uri, d.metadata as document_metadata
-            FROM rrf_scores r
-            JOIN documents d ON r.document_id = d.id
-            ORDER BY r.rrf_score DESC
-            LIMIT :limit
-            """,
-            {
-                "embedding": serialized_query_embedding,
-                "k_vector": limit * 3,
-                "fts_query": fts_query,
-                "k": k,
-                "limit": limit,
-            },
-        )
+            return await self._process_search_results(results)
 
-        results = cursor.fetchall()
-        return [
-            (
-                Chunk(
-                    id=chunk_id,
-                    document_id=document_id,
-                    content=content,
-                    metadata=json.loads(metadata_json) if metadata_json else {},
-                    document_uri=document_uri,
-                    document_meta=json.loads(document_metadata_json)
-                    if document_metadata_json
-                    else {},
-                ),
-                rrf_score,
+        else:  # hybrid (default)
+            query_embedding = await self.embedder.embed(query)
+
+            # Create RRF reranker
+            reranker = RRFReranker()
+
+            # Perform native hybrid search with RRF reranking
+            results = (
+                self.store.chunks_table.search(query_type="hybrid")
+                .vector(query_embedding)
+                .text(query)
+                .rerank(reranker)
+                .limit(limit)
             )
-            for chunk_id, document_id, content, metadata_json, rrf_score, document_uri, document_metadata_json in results
-        ]
+            return await self._process_search_results(results)
 
-    async def get_by_document_id(self, document_id: int) -> list[Chunk]:
+    async def get_by_document_id(self, document_id: str) -> list[Chunk]:
         """Get all chunks for a specific document."""
-        if self.store._connection is None:
-            raise ValueError("Store connection is not available")
-
-        cursor = self.store._connection.cursor()
-        cursor.execute(
-            """
-            SELECT c.id, c.document_id, c.content, c.metadata, d.uri, d.metadata as document_metadata
-            FROM chunks c
-            JOIN documents d ON c.document_id = d.id
-            WHERE c.document_id = :document_id
-            ORDER BY JSON_EXTRACT(c.metadata, '$.order')
-            """,
-            {"document_id": document_id},
+        results = list(
+            self.store.chunks_table.search()
+            .where(f"document_id = '{document_id}'")
+            .to_pydantic(self.store.ChunkRecord)
         )
 
-        rows = cursor.fetchall()
-        return [
+        # Get document info
+        doc_results = list(
+            self.store.documents_table.search()
+            .where(f"id = '{document_id}'")
+            .limit(1)
+            .to_pydantic(DocumentRecord)
+        )
+
+        doc_uri = doc_results[0].uri if doc_results else None
+        doc_meta = doc_results[0].metadata if doc_results else "{}"
+
+        # Sort by order in metadata
+        chunks = [
             Chunk(
-                id=chunk_id,
-                document_id=document_id,
-                content=content,
-                metadata=json.loads(metadata_json) if metadata_json else {},
-                document_uri=document_uri,
-                document_meta=json.loads(document_metadata_json)
-                if document_metadata_json
-                else {},
+                id=chunk.id,
+                document_id=chunk.document_id,
+                content=chunk.content,
+                metadata=json.loads(chunk.metadata) if chunk.metadata else {},
+                document_uri=doc_uri,
+                document_meta=json.loads(doc_meta) if doc_meta else {},
             )
-            for chunk_id, document_id, content, metadata_json, document_uri, document_metadata_json in rows
+            for chunk in results
         ]
 
+        chunks.sort(key=lambda c: c.metadata.get("order", 0))
+        return chunks
+
     async def get_adjacent_chunks(self, chunk: Chunk, num_adjacent: int) -> list[Chunk]:
         """Get adjacent chunks before and after the given chunk within the same document."""
-        if self.store._connection is None:
-            raise ValueError("Store connection is not available")
-        if chunk.document_id is None:
-            return []
+        assert chunk.document_id, "Document id is required for adjacent chunk finding"
 
-        cursor = self.store._connection.cursor()
         chunk_order = chunk.metadata.get("order")
         if chunk_order is None:
             return []
 
-        # Get adjacent chunks within the same document
-        cursor.execute(
-            """
-            SELECT c.id, c.document_id, c.content, c.metadata, d.uri, d.metadata as document_metadata
-            FROM chunks c
-            JOIN documents d ON c.document_id = d.id
-            WHERE c.document_id = :document_id
-            AND JSON_EXTRACT(c.metadata, '$.order') BETWEEN :start_order AND :end_order
-            AND c.id != :chunk_id
-            ORDER BY JSON_EXTRACT(c.metadata, '$.order')
-            """,
-            {
-                "document_id": chunk.document_id,
-                "start_order": max(0, chunk_order - num_adjacent),
-                "end_order": chunk_order + num_adjacent,
-                "chunk_id": chunk.id,
-            },
-        )
+        # Get all chunks for the document
+        all_chunks = await self.get_by_document_id(chunk.document_id)
+
+        # Filter to adjacent chunks
+        adjacent_chunks = []
+        for c in all_chunks:
+            c_order = c.metadata.get("order", 0)
+            if c.id != chunk.id and abs(c_order - chunk_order) <= num_adjacent:
+                adjacent_chunks.append(c)
+
+        return adjacent_chunks
+
+    async def _process_search_results(self, query_result) -> list[tuple[Chunk, float]]:
+        """Process search results into chunks with document info and scores."""
+        chunks_with_scores = []
+
+        # Get both arrow and pydantic results to access scores
+        arrow_result = query_result.to_arrow()
+        pydantic_results = list(query_result.to_pydantic(self.store.ChunkRecord))
+
+        # Extract scores from arrow result based on search type
+        scores = []
+        column_names = arrow_result.column_names
+
+        if "_distance" in column_names:
+            # Vector search - distance (lower is better, convert to similarity)
+            distances = arrow_result.column("_distance").to_pylist()
+            scores = [max(0.0, 1.0 / (1.0 + dist)) for dist in distances]
+        elif "_relevance_score" in column_names:
+            # Hybrid search - relevance score (higher is better)
+            scores = arrow_result.column("_relevance_score").to_pylist()
+        elif "_score" in column_names:
+            # FTS search - score (higher is better)
+            scores = arrow_result.column("_score").to_pylist()
+        else:
+            raise ValueError("Unknown search result format, cannot extract scores")
+
+        # Collect all unique document IDs for batch lookup
+        document_ids = list(set(chunk.document_id for chunk in pydantic_results))
+
+        # Batch fetch all documents at once
+        documents_map = {}
+        if document_ids:
+            # Create a WHERE clause for all document IDs
+            where_clause = " OR ".join(f"id = '{doc_id}'" for doc_id in document_ids)
+            doc_results = list(
+                self.store.documents_table.search()
+                .where(where_clause)
+                .to_pydantic(DocumentRecord)
+            )
+            documents_map = {doc.id: doc for doc in doc_results}
 
-        rows = cursor.fetchall()
-        return [
-            Chunk(
-                id=chunk_id,
-                document_id=document_id,
-                content=content,
-                metadata=json.loads(metadata_json) if metadata_json else {},
-                document_uri=document_uri,
-                document_meta=json.loads(document_metadata_json)
-                if document_metadata_json
+        for i, chunk_record in enumerate(pydantic_results):
+            # Get document info from pre-fetched map
+            doc = documents_map.get(chunk_record.document_id)
+            doc_uri = doc.uri if doc else None
+            doc_meta = doc.metadata if doc else "{}"
+
+            chunk = Chunk(
+                id=chunk_record.id,
+                document_id=chunk_record.document_id,
+                content=chunk_record.content,
+                metadata=json.loads(chunk_record.metadata)
+                if chunk_record.metadata
                 else {},
+                document_uri=doc_uri,
+                document_meta=json.loads(doc_meta) if doc_meta else {},
             )
-            for chunk_id, document_id, content, metadata_json, document_uri, document_metadata_json in rows
-        ]
+
+            # Get score from arrow result
+            score = scores[i] if i < len(scores) else 1.0
+
+            chunks_with_scores.append((chunk, score))
+
+        return chunks_with_scores
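For orientation, here is a minimal usage sketch of the search API this release moves to. Only the ChunkRepository(store) constructor and the search(query, limit, search_type) signature come from the diff above; the Store constructor arguments and the repository's module path are not shown in this hunk, so those parts are assumptions for illustration.

import asyncio

from haiku.rag.store.engine import Store
from haiku.rag.store.repositories.chunk import ChunkRepository  # assumed module path


async def main() -> None:
    # Assumed constructor: the diff only shows that ChunkRepository takes a Store.
    store = Store("path/to/lancedb")
    repo = ChunkRepository(store)

    # search() replaces the old search_chunks / search_chunks_fts /
    # search_chunks_hybrid methods; search_type may be "vector", "fts",
    # or "hybrid" (the default).
    results = await repo.search("how are documents chunked?", limit=5)
    for chunk, score in results:
        print(f"{score:.3f} {chunk.document_uri}: {chunk.content[:80]}")


asyncio.run(main())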