haiku.rag 0.9.2__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. README.md +205 -0
  2. haiku_rag-0.14.0.dist-info/METADATA +227 -0
  3. haiku_rag-0.14.0.dist-info/RECORD +6 -0
  4. haiku/rag/__init__.py +0 -0
  5. haiku/rag/app.py +0 -267
  6. haiku/rag/chunker.py +0 -51
  7. haiku/rag/cli.py +0 -359
  8. haiku/rag/client.py +0 -565
  9. haiku/rag/config.py +0 -77
  10. haiku/rag/embeddings/__init__.py +0 -35
  11. haiku/rag/embeddings/base.py +0 -15
  12. haiku/rag/embeddings/ollama.py +0 -17
  13. haiku/rag/embeddings/openai.py +0 -16
  14. haiku/rag/embeddings/vllm.py +0 -19
  15. haiku/rag/embeddings/voyageai.py +0 -17
  16. haiku/rag/logging.py +0 -56
  17. haiku/rag/mcp.py +0 -144
  18. haiku/rag/migration.py +0 -316
  19. haiku/rag/monitor.py +0 -73
  20. haiku/rag/qa/__init__.py +0 -15
  21. haiku/rag/qa/agent.py +0 -89
  22. haiku/rag/qa/prompts.py +0 -60
  23. haiku/rag/reader.py +0 -115
  24. haiku/rag/reranking/__init__.py +0 -34
  25. haiku/rag/reranking/base.py +0 -13
  26. haiku/rag/reranking/cohere.py +0 -34
  27. haiku/rag/reranking/mxbai.py +0 -28
  28. haiku/rag/reranking/vllm.py +0 -44
  29. haiku/rag/research/__init__.py +0 -37
  30. haiku/rag/research/base.py +0 -130
  31. haiku/rag/research/dependencies.py +0 -45
  32. haiku/rag/research/evaluation_agent.py +0 -42
  33. haiku/rag/research/orchestrator.py +0 -300
  34. haiku/rag/research/presearch_agent.py +0 -34
  35. haiku/rag/research/prompts.py +0 -129
  36. haiku/rag/research/search_agent.py +0 -65
  37. haiku/rag/research/synthesis_agent.py +0 -40
  38. haiku/rag/store/__init__.py +0 -4
  39. haiku/rag/store/engine.py +0 -230
  40. haiku/rag/store/models/__init__.py +0 -4
  41. haiku/rag/store/models/chunk.py +0 -15
  42. haiku/rag/store/models/document.py +0 -16
  43. haiku/rag/store/repositories/__init__.py +0 -9
  44. haiku/rag/store/repositories/chunk.py +0 -399
  45. haiku/rag/store/repositories/document.py +0 -234
  46. haiku/rag/store/repositories/settings.py +0 -148
  47. haiku/rag/store/upgrades/__init__.py +0 -1
  48. haiku/rag/utils.py +0 -162
  49. haiku_rag-0.9.2.dist-info/METADATA +0 -131
  50. haiku_rag-0.9.2.dist-info/RECORD +0 -50
  51. {haiku_rag-0.9.2.dist-info → haiku_rag-0.14.0.dist-info}/WHEEL +0 -0
  52. {haiku_rag-0.9.2.dist-info → haiku_rag-0.14.0.dist-info}/entry_points.txt +0 -0
  53. {haiku_rag-0.9.2.dist-info → haiku_rag-0.14.0.dist-info}/licenses/LICENSE +0 -0
haiku/rag/store/engine.py DELETED
@@ -1,230 +0,0 @@
- import json
- import logging
- from datetime import timedelta
- from importlib import metadata
- from pathlib import Path
- from uuid import uuid4
-
- import lancedb
- from lancedb.pydantic import LanceModel, Vector
- from pydantic import Field
-
- from haiku.rag.config import Config
- from haiku.rag.embeddings import get_embedder
-
- logger = logging.getLogger(__name__)
-
-
- class DocumentRecord(LanceModel):
-     id: str = Field(default_factory=lambda: str(uuid4()))
-     content: str
-     uri: str | None = None
-     metadata: str = Field(default="{}")
-     created_at: str = Field(default_factory=lambda: "")
-     updated_at: str = Field(default_factory=lambda: "")
-
-
- def create_chunk_model(vector_dim: int):
-     """Create a ChunkRecord model with the specified vector dimension.
-
-     This creates a model with proper vector typing for LanceDB.
-     """
-
-     class ChunkRecord(LanceModel):
-         id: str = Field(default_factory=lambda: str(uuid4()))
-         document_id: str
-         content: str
-         metadata: str = Field(default="{}")
-         vector: Vector(vector_dim) = Field(default_factory=lambda: [0.0] * vector_dim)  # type: ignore
-
-     return ChunkRecord
-
-
- class SettingsRecord(LanceModel):
-     id: str = Field(default="settings")
-     settings: str = Field(default="{}")
-
-
- class Store:
-     def __init__(self, db_path: Path, skip_validation: bool = False):
-         self.db_path: Path = db_path
-         self.embedder = get_embedder()
-
-         # Create the ChunkRecord model with the correct vector dimension
-         self.ChunkRecord = create_chunk_model(self.embedder._vector_dim)
-
-         # Connect to LanceDB
-         self.db = self._connect_to_lancedb(db_path)
-
-         # Initialize tables
-         self.create_or_update_db()
-
-         # Validate config compatibility after connection is established
-         if not skip_validation:
-             self._validate_configuration()
-
-     def vacuum(self) -> None:
-         """Optimize and clean up old versions across all tables to reduce disk usage."""
-         if self._has_cloud_config() and str(Config.LANCEDB_URI).startswith("db://"):
-             return
-
-         # Perform maintenance per table using optimize() with cleanup_older_than 0
-         for table in [self.documents_table, self.chunks_table, self.settings_table]:
-             table.optimize(cleanup_older_than=timedelta(0))
-
-     def _connect_to_lancedb(self, db_path: Path):
-         """Establish connection to LanceDB (local, cloud, or object storage)."""
-         # Check if we have cloud configuration
-         if self._has_cloud_config():
-             return lancedb.connect(
-                 uri=Config.LANCEDB_URI,
-                 api_key=Config.LANCEDB_API_KEY,
-                 region=Config.LANCEDB_REGION,
-             )
-         else:
-             # Local file system connection
-             return lancedb.connect(db_path)
-
-     def _has_cloud_config(self) -> bool:
-         """Check if cloud configuration is complete."""
-         return bool(
-             Config.LANCEDB_URI and Config.LANCEDB_API_KEY and Config.LANCEDB_REGION
-         )
-
-     def _validate_configuration(self) -> None:
-         """Validate that the configuration is compatible with the database."""
-         from haiku.rag.store.repositories.settings import SettingsRepository
-
-         settings_repo = SettingsRepository(self)
-         settings_repo.validate_config_compatibility()
-
-     def create_or_update_db(self):
-         """Create the database tables."""
-
-         # Get list of existing tables
-         existing_tables = self.db.table_names()
-
-         # Create or get documents table
-         if "documents" in existing_tables:
-             self.documents_table = self.db.open_table("documents")
-         else:
-             self.documents_table = self.db.create_table(
-                 "documents", schema=DocumentRecord
-             )
-
-         # Create or get chunks table
-         if "chunks" in existing_tables:
-             self.chunks_table = self.db.open_table("chunks")
-         else:
-             self.chunks_table = self.db.create_table("chunks", schema=self.ChunkRecord)
-             # Create FTS index on the new table
-             self.chunks_table.create_fts_index("content", replace=True)
-
-         # Create or get settings table
-         if "settings" in existing_tables:
-             self.settings_table = self.db.open_table("settings")
-         else:
-             self.settings_table = self.db.create_table(
-                 "settings", schema=SettingsRecord
-             )
-             # Save current settings to the new database
-             settings_data = Config.model_dump(mode="json")
-             self.settings_table.add(
-                 [SettingsRecord(id="settings", settings=json.dumps(settings_data))]
-             )
-
-         # Set current version in settings
-         current_version = metadata.version("haiku.rag")
-         self.set_haiku_version(current_version)
-
-         # Check if we need to perform upgrades
-         try:
-             existing_settings = list(
-                 self.settings_table.search().limit(1).to_pydantic(SettingsRecord)
-             )
-             if existing_settings:
-                 db_version = self.get_haiku_version()  # noqa: F841
-                 # TODO: Add upgrade logic here similar to SQLite version when needed
-         except Exception:
-             # Settings table might not exist yet in fresh databases
-             pass
-
-     def get_haiku_version(self) -> str:
-         """Returns the user version stored in settings."""
-         settings_records = list(
-             self.settings_table.search().limit(1).to_pydantic(SettingsRecord)
-         )
-         if settings_records:
-             settings = (
-                 json.loads(settings_records[0].settings)
-                 if settings_records[0].settings
-                 else {}
-             )
-             return settings.get("version", "0.0.0")
-         return "0.0.0"
-
-     def set_haiku_version(self, version: str) -> None:
-         """Updates the user version in settings."""
-         settings_records = list(
-             self.settings_table.search().limit(1).to_pydantic(SettingsRecord)
-         )
-         if settings_records:
-             # Only write if version actually changes to avoid creating new table versions
-             current = (
-                 json.loads(settings_records[0].settings)
-                 if settings_records[0].settings
-                 else {}
-             )
-             if current.get("version") != version:
-                 current["version"] = version
-                 self.settings_table.update(
-                     where="id = 'settings'",
-                     values={"settings": json.dumps(current)},
-                 )
-         else:
-             # Create new settings record
-             settings_data = Config.model_dump(mode="json")
-             settings_data["version"] = version
-             self.settings_table.add(
-                 [SettingsRecord(id="settings", settings=json.dumps(settings_data))]
-             )
-
-     def recreate_embeddings_table(self) -> None:
-         """Recreate the chunks table with current vector dimensions."""
-         # Drop and recreate chunks table
-         try:
-             self.db.drop_table("chunks")
-         except Exception:
-             pass
-
-         # Update the ChunkRecord model with new vector dimension
-         self.ChunkRecord = create_chunk_model(self.embedder._vector_dim)
-         self.chunks_table = self.db.create_table("chunks", schema=self.ChunkRecord)
-
-         # Create FTS index on the new table
-         self.chunks_table.create_fts_index("content", replace=True)
-
-     def close(self):
-         """Close the database connection."""
-         # LanceDB connections are automatically managed
-         pass
-
-     def current_table_versions(self) -> dict[str, int]:
-         """Capture current versions of key tables for rollback using LanceDB's API."""
-         return {
-             "documents": int(self.documents_table.version),
-             "chunks": int(self.chunks_table.version),
-             "settings": int(self.settings_table.version),
-         }
-
-     def restore_table_versions(self, versions: dict[str, int]) -> bool:
-         """Restore tables to the provided versions using LanceDB's API."""
-         self.documents_table.restore(int(versions["documents"]))
-         self.chunks_table.restore(int(versions["chunks"]))
-         self.settings_table.restore(int(versions["settings"]))
-         return True
-
-     @property
-     def _connection(self):
-         """Compatibility property for repositories expecting _connection."""
-         return self
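
For reference, a minimal sketch of how the removed Store engine was driven in 0.9.2, using only the constructor and methods shown above; the local database path is hypothetical, and with no LANCEDB_URI/API_KEY/REGION set in Config the connection falls back to the local filesystem:

    from pathlib import Path

    from haiku.rag.store.engine import Store

    # Hypothetical local path; _connect_to_lancedb() sees no cloud config and
    # calls lancedb.connect(db_path) on the filesystem.
    store = Store(Path("./haiku.lancedb"), skip_validation=True)

    # Snapshot table versions so a later migration step can roll back to them.
    versions = store.current_table_versions()
    store.restore_table_versions(versions)

    # Drop old table versions to reclaim disk space, then close.
    store.vacuum()
    store.close()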
haiku/rag/store/models/__init__.py DELETED
@@ -1,4 +0,0 @@
- from .chunk import Chunk
- from .document import Document
-
- __all__ = ["Chunk", "Document"]
haiku/rag/store/models/chunk.py DELETED
@@ -1,15 +0,0 @@
- from pydantic import BaseModel
-
-
- class Chunk(BaseModel):
-     """
-     Represents a chunk with content, metadata, and optional document information.
-     """
-
-     id: str | None = None
-     document_id: str | None = None
-     content: str
-     metadata: dict = {}
-     document_uri: str | None = None
-     document_meta: dict = {}
-     embedding: list[float] | None = None
haiku/rag/store/models/document.py DELETED
@@ -1,16 +0,0 @@
- from datetime import datetime
-
- from pydantic import BaseModel, Field
-
-
- class Document(BaseModel):
-     """
-     Represents a document with an ID, content, and metadata.
-     """
-
-     id: str | None = None
-     content: str
-     uri: str | None = None
-     metadata: dict = {}
-     created_at: datetime = Field(default_factory=datetime.now)
-     updated_at: datetime = Field(default_factory=datetime.now)
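
The two removed models above are plain Pydantic containers with no persistence logic of their own. A small illustration of how a Document and its Chunk related in 0.9.2; the field values here are made up:

    from haiku.rag.store.models.chunk import Chunk
    from haiku.rag.store.models.document import Document

    # Only content is required; id and the timestamps are filled in by the repositories.
    doc = Document(content="LanceDB keeps vectors next to the row data.", uri="file:///notes.md")

    # A chunk points back at its parent document and records its position
    # under the "order" metadata key, as create_chunks_for_document does below.
    chunk = Chunk(
        document_id=doc.id,  # still None until the document is persisted
        content=doc.content,
        metadata={"order": 0},
        document_uri=doc.uri,
    )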
haiku/rag/store/repositories/__init__.py DELETED
@@ -1,9 +0,0 @@
- from haiku.rag.store.repositories.chunk import ChunkRepository
- from haiku.rag.store.repositories.document import DocumentRepository
- from haiku.rag.store.repositories.settings import SettingsRepository
-
- __all__ = [
-     "ChunkRepository",
-     "DocumentRepository",
-     "SettingsRepository",
- ]
haiku/rag/store/repositories/chunk.py DELETED
@@ -1,399 +0,0 @@
- import asyncio
- import inspect
- import json
- import logging
- from uuid import uuid4
-
- from docling_core.types.doc.document import DoclingDocument
- from lancedb.rerankers import RRFReranker
-
- from haiku.rag.chunker import chunker
- from haiku.rag.config import Config
- from haiku.rag.embeddings import get_embedder
- from haiku.rag.store.engine import DocumentRecord, Store
- from haiku.rag.store.models.chunk import Chunk
- from haiku.rag.utils import load_callable, text_to_docling_document
-
- logger = logging.getLogger(__name__)
-
-
- class ChunkRepository:
-     """Repository for Chunk operations."""
-
-     def __init__(self, store: Store) -> None:
-         self.store = store
-         self.embedder = get_embedder()
-         self._optimize_lock = asyncio.Lock()
-
-     def _ensure_fts_index(self) -> None:
-         """Ensure FTS index exists on the content column."""
-         try:
-             self.store.chunks_table.create_fts_index("content", replace=True)
-         except Exception as e:
-             # Log the error but don't fail - FTS might already exist
-             logger.debug(f"FTS index creation skipped: {e}")
-
-     async def _optimize(self) -> None:
-         """Optimize the chunks table to refresh indexes."""
-         # Skip optimization for LanceDB Cloud as it handles this automatically
-         if Config.LANCEDB_URI and Config.LANCEDB_URI.startswith("db://"):
-             return
-
-         async with self._optimize_lock:
-             try:
-                 self.store.chunks_table.optimize()
-             except (RuntimeError, OSError) as e:
-                 # Handle "too many open files" and other resource errors gracefully
-                 logger.debug(
-                     f"Table optimization skipped due to resource constraints: {e}"
-                 )
-
-     async def create(self, entity: Chunk) -> Chunk:
-         """Create a chunk in the database."""
-         assert entity.document_id, "Chunk must have a document_id to be created"
-
-         chunk_id = str(uuid4())
-
-         # Generate embedding if not provided
-         if entity.embedding is not None:
-             embedding = entity.embedding
-         else:
-             embedding = await self.embedder.embed(entity.content)
-         chunk_record = self.store.ChunkRecord(
-             id=chunk_id,
-             document_id=entity.document_id,
-             content=entity.content,
-             metadata=json.dumps(entity.metadata),
-             vector=embedding,
-         )
-
-         self.store.chunks_table.add([chunk_record])
-
-         entity.id = chunk_id
-
-         # Try to optimize if not currently locked (non-blocking)
-         if not self._optimize_lock.locked():
-             asyncio.create_task(self._optimize())
-
-         return entity
-
-     async def get_by_id(self, entity_id: str) -> Chunk | None:
-         """Get a chunk by its ID."""
-         results = list(
-             self.store.chunks_table.search()
-             .where(f"id = '{entity_id}'")
-             .limit(1)
-             .to_pydantic(self.store.ChunkRecord)
-         )
-
-         if not results:
-             return None
-
-         chunk_record = results[0]
-         return Chunk(
-             id=chunk_record.id,
-             document_id=chunk_record.document_id,
-             content=chunk_record.content,
-             metadata=json.loads(chunk_record.metadata) if chunk_record.metadata else {},
-         )
-
-     async def update(self, entity: Chunk) -> Chunk:
-         """Update an existing chunk."""
-         assert entity.id, "Chunk ID is required for update"
-
-         embedding = await self.embedder.embed(entity.content)
-
-         self.store.chunks_table.update(
-             where=f"id = '{entity.id}'",
-             values={
-                 "document_id": entity.document_id,
-                 "content": entity.content,
-                 "metadata": json.dumps(entity.metadata),
-                 "vector": embedding,
-             },
-         )
-         # Try to optimize if not currently locked (non-blocking)
-         if not self._optimize_lock.locked():
-             asyncio.create_task(self._optimize())
-
-         return entity
-
-     async def delete(self, entity_id: str) -> bool:
-         """Delete a chunk by its ID."""
-         chunk = await self.get_by_id(entity_id)
-         if chunk is None:
-             return False
-
-         self.store.chunks_table.delete(f"id = '{entity_id}'")
-         return True
-
-     async def list_all(
-         self, limit: int | None = None, offset: int | None = None
-     ) -> list[Chunk]:
-         """List all chunks with optional pagination."""
-         query = self.store.chunks_table.search()
-
-         if offset is not None:
-             query = query.offset(offset)
-         if limit is not None:
-             query = query.limit(limit)
-
-         results = list(query.to_pydantic(self.store.ChunkRecord))
-
-         return [
-             Chunk(
-                 id=chunk.id,
-                 document_id=chunk.document_id,
-                 content=chunk.content,
-                 metadata=json.loads(chunk.metadata) if chunk.metadata else {},
-             )
-             for chunk in results
-         ]
-
-     async def create_chunks_for_document(
-         self, document_id: str, document: DoclingDocument
-     ) -> list[Chunk]:
-         """Create chunks and embeddings for a document from DoclingDocument."""
-         # Optionally preprocess markdown before chunking
-         processed_document = document
-         preprocessor_path = Config.MARKDOWN_PREPROCESSOR
-         if preprocessor_path:
-             try:
-                 pre_fn = load_callable(preprocessor_path)
-                 markdown = document.export_to_markdown()
-                 result = pre_fn(markdown)
-                 if inspect.isawaitable(result):
-                     result = await result  # type: ignore[assignment]
-                 processed_markdown = result
-                 if not isinstance(processed_markdown, str):
-                     raise ValueError("Preprocessor must return a markdown string")
-                 processed_document = text_to_docling_document(
-                     processed_markdown, name="content.md"
-                 )
-             except Exception as e:
-                 logger.error(
-                     f"Failed to apply MARKDOWN_PREPROCESSOR '{preprocessor_path}': {e}. Proceeding without preprocessing."
-                 )
-                 raise e
-
-         chunk_texts = await chunker.chunk(processed_document)
-
-         embeddings = await self.embedder.embed(chunk_texts)
-
-         # Prepare all chunk records for batch insertion
-         chunk_records = []
-         created_chunks = []
-
-         for order, (chunk_text, embedding) in enumerate(zip(chunk_texts, embeddings)):
-             chunk_id = str(uuid4())
-
-             chunk_record = self.store.ChunkRecord(
-                 id=chunk_id,
-                 document_id=document_id,
-                 content=chunk_text,
-                 metadata=json.dumps({"order": order}),
-                 vector=embedding,
-             )
-             chunk_records.append(chunk_record)
-
-             chunk = Chunk(
-                 id=chunk_id,
-                 document_id=document_id,
-                 content=chunk_text,
-                 metadata={"order": order},
-             )
-             created_chunks.append(chunk)
-
-         # Batch insert all chunks at once
-         if chunk_records:
-             self.store.chunks_table.add(chunk_records)
-
-         # Force optimization once at the end for bulk operations
-         await self._optimize()
-         return created_chunks
-
-     async def delete_all(self) -> None:
-         """Delete all chunks from the database."""
-         # Drop and recreate table to clear all data
-         self.store.db.drop_table("chunks")
-         self.store.chunks_table = self.store.db.create_table(
-             "chunks", schema=self.store.ChunkRecord
-         )
-         # Create FTS index on the new table
-         self.store.chunks_table.create_fts_index("content", replace=True)
-
-     async def delete_by_document_id(self, document_id: str) -> bool:
-         """Delete all chunks for a document."""
-         chunks = await self.get_by_document_id(document_id)
-
-         if not chunks:
-             return False
-
-         self.store.chunks_table.delete(f"document_id = '{document_id}'")
-         return True
-
-     async def search(
-         self, query: str, limit: int = 5, search_type: str = "hybrid"
-     ) -> list[tuple[Chunk, float]]:
-         """Search for relevant chunks using the specified search method.
-
-         Args:
-             query: The search query string.
-             limit: Maximum number of results to return.
-             search_type: Type of search - "vector", "fts", or "hybrid" (default).
-
-         Returns:
-             List of (chunk, score) tuples ordered by relevance.
-         """
-         if not query.strip():
-             return []
-
-         if search_type == "vector":
-             query_embedding = await self.embedder.embed(query)
-
-             results = self.store.chunks_table.search(
-                 query_embedding, query_type="vector", vector_column_name="vector"
-             ).limit(limit)
-
-             return await self._process_search_results(results)
-
-         elif search_type == "fts":
-             results = self.store.chunks_table.search(query, query_type="fts").limit(
-                 limit
-             )
-             return await self._process_search_results(results)
-
-         else:  # hybrid (default)
-             query_embedding = await self.embedder.embed(query)
-
-             # Create RRF reranker
-             reranker = RRFReranker()
-
-             # Perform native hybrid search with RRF reranking
-             results = (
-                 self.store.chunks_table.search(query_type="hybrid")
-                 .vector(query_embedding)
-                 .text(query)
-                 .rerank(reranker)
-                 .limit(limit)
-             )
-             return await self._process_search_results(results)
-
-     async def get_by_document_id(self, document_id: str) -> list[Chunk]:
-         """Get all chunks for a specific document."""
-         results = list(
-             self.store.chunks_table.search()
-             .where(f"document_id = '{document_id}'")
-             .to_pydantic(self.store.ChunkRecord)
-         )
-
-         # Get document info
-         doc_results = list(
-             self.store.documents_table.search()
-             .where(f"id = '{document_id}'")
-             .limit(1)
-             .to_pydantic(DocumentRecord)
-         )
-
-         doc_uri = doc_results[0].uri if doc_results else None
-         doc_meta = doc_results[0].metadata if doc_results else "{}"
-
-         # Sort by order in metadata
-         chunks = [
-             Chunk(
-                 id=chunk.id,
-                 document_id=chunk.document_id,
-                 content=chunk.content,
-                 metadata=json.loads(chunk.metadata) if chunk.metadata else {},
-                 document_uri=doc_uri,
-                 document_meta=json.loads(doc_meta) if doc_meta else {},
-             )
-             for chunk in results
-         ]
-
-         chunks.sort(key=lambda c: c.metadata.get("order", 0))
-         return chunks
-
-     async def get_adjacent_chunks(self, chunk: Chunk, num_adjacent: int) -> list[Chunk]:
-         """Get adjacent chunks before and after the given chunk within the same document."""
-         assert chunk.document_id, "Document id is required for adjacent chunk finding"
-
-         chunk_order = chunk.metadata.get("order")
-         if chunk_order is None:
-             return []
-
-         # Get all chunks for the document
-         all_chunks = await self.get_by_document_id(chunk.document_id)
-
-         # Filter to adjacent chunks
-         adjacent_chunks = []
-         for c in all_chunks:
-             c_order = c.metadata.get("order", 0)
-             if c.id != chunk.id and abs(c_order - chunk_order) <= num_adjacent:
-                 adjacent_chunks.append(c)
-
-         return adjacent_chunks
-
-     async def _process_search_results(self, query_result) -> list[tuple[Chunk, float]]:
-         """Process search results into chunks with document info and scores."""
-         chunks_with_scores = []
-
-         # Get both arrow and pydantic results to access scores
-         arrow_result = query_result.to_arrow()
-         pydantic_results = list(query_result.to_pydantic(self.store.ChunkRecord))
-
-         # Extract scores from arrow result based on search type
-         scores = []
-         column_names = arrow_result.column_names
-
-         if "_distance" in column_names:
-             # Vector search - distance (lower is better, convert to similarity)
-             distances = arrow_result.column("_distance").to_pylist()
-             scores = [max(0.0, 1.0 / (1.0 + dist)) for dist in distances]
-         elif "_relevance_score" in column_names:
-             # Hybrid search - relevance score (higher is better)
-             scores = arrow_result.column("_relevance_score").to_pylist()
-         elif "_score" in column_names:
-             # FTS search - score (higher is better)
-             scores = arrow_result.column("_score").to_pylist()
-         else:
-             raise ValueError("Unknown search result format, cannot extract scores")
-
-         # Collect all unique document IDs for batch lookup
-         document_ids = list(set(chunk.document_id for chunk in pydantic_results))
-
-         # Batch fetch all documents at once
-         documents_map = {}
-         if document_ids:
-             # Create a WHERE clause for all document IDs
-             where_clause = " OR ".join(f"id = '{doc_id}'" for doc_id in document_ids)
-             doc_results = list(
-                 self.store.documents_table.search()
-                 .where(where_clause)
-                 .to_pydantic(DocumentRecord)
-             )
-             documents_map = {doc.id: doc for doc in doc_results}
-
-         for i, chunk_record in enumerate(pydantic_results):
-             # Get document info from pre-fetched map
-             doc = documents_map.get(chunk_record.document_id)
-             doc_uri = doc.uri if doc else None
-             doc_meta = doc.metadata if doc else "{}"
-
-             chunk = Chunk(
-                 id=chunk_record.id,
-                 document_id=chunk_record.document_id,
-                 content=chunk_record.content,
-                 metadata=json.loads(chunk_record.metadata)
-                 if chunk_record.metadata
-                 else {},
-                 document_uri=doc_uri,
-                 document_meta=json.loads(doc_meta) if doc_meta else {},
-             )
-
-             # Get score from arrow result
-             score = scores[i] if i < len(scores) else 1.0
-
-             chunks_with_scores.append((chunk, score))
-
-         return chunks_with_scores
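
As a closing note on the removed repository: all three retrieval modes went through the single async search entry point, and _process_search_results normalized vector distances to 1 / (1 + distance) while passing FTS and hybrid relevance scores through unchanged. A hedged sketch of how it was typically called; the store path and query are illustrative:

    import asyncio
    from pathlib import Path

    from haiku.rag.store.engine import Store
    from haiku.rag.store.repositories.chunk import ChunkRepository

    async def main() -> None:
        store = Store(Path("./haiku.lancedb"))  # hypothetical local database
        repo = ChunkRepository(store)

        # "hybrid" (the default) fuses vector and FTS hits with RRF reranking;
        # "vector" and "fts" each query a single index.
        for search_type in ("hybrid", "vector", "fts"):
            results = await repo.search(
                "how are embeddings stored?", limit=3, search_type=search_type
            )
            for chunk, score in results:
                print(search_type, round(score, 3), chunk.document_uri, chunk.metadata.get("order"))

    asyncio.run(main())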