haiku.rag 0.10.2__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- README.md +205 -0
- {haiku_rag-0.10.2.dist-info → haiku_rag-0.14.0.dist-info}/METADATA +100 -41
- haiku_rag-0.14.0.dist-info/RECORD +6 -0
- haiku/rag/__init__.py +0 -0
- haiku/rag/app.py +0 -437
- haiku/rag/chunker.py +0 -51
- haiku/rag/cli.py +0 -466
- haiku/rag/client.py +0 -605
- haiku/rag/config.py +0 -81
- haiku/rag/embeddings/__init__.py +0 -35
- haiku/rag/embeddings/base.py +0 -15
- haiku/rag/embeddings/ollama.py +0 -17
- haiku/rag/embeddings/openai.py +0 -16
- haiku/rag/embeddings/vllm.py +0 -19
- haiku/rag/embeddings/voyageai.py +0 -17
- haiku/rag/logging.py +0 -56
- haiku/rag/mcp.py +0 -156
- haiku/rag/migration.py +0 -316
- haiku/rag/monitor.py +0 -73
- haiku/rag/qa/__init__.py +0 -15
- haiku/rag/qa/agent.py +0 -91
- haiku/rag/qa/prompts.py +0 -60
- haiku/rag/reader.py +0 -115
- haiku/rag/reranking/__init__.py +0 -34
- haiku/rag/reranking/base.py +0 -13
- haiku/rag/reranking/cohere.py +0 -34
- haiku/rag/reranking/mxbai.py +0 -28
- haiku/rag/reranking/vllm.py +0 -44
- haiku/rag/research/__init__.py +0 -20
- haiku/rag/research/common.py +0 -53
- haiku/rag/research/dependencies.py +0 -47
- haiku/rag/research/graph.py +0 -29
- haiku/rag/research/models.py +0 -70
- haiku/rag/research/nodes/evaluate.py +0 -80
- haiku/rag/research/nodes/plan.py +0 -63
- haiku/rag/research/nodes/search.py +0 -93
- haiku/rag/research/nodes/synthesize.py +0 -51
- haiku/rag/research/prompts.py +0 -114
- haiku/rag/research/state.py +0 -25
- haiku/rag/store/__init__.py +0 -4
- haiku/rag/store/engine.py +0 -269
- haiku/rag/store/models/__init__.py +0 -4
- haiku/rag/store/models/chunk.py +0 -17
- haiku/rag/store/models/document.py +0 -17
- haiku/rag/store/repositories/__init__.py +0 -9
- haiku/rag/store/repositories/chunk.py +0 -424
- haiku/rag/store/repositories/document.py +0 -237
- haiku/rag/store/repositories/settings.py +0 -155
- haiku/rag/store/upgrades/__init__.py +0 -62
- haiku/rag/store/upgrades/v0_10_1.py +0 -64
- haiku/rag/store/upgrades/v0_9_3.py +0 -112
- haiku/rag/utils.py +0 -199
- haiku_rag-0.10.2.dist-info/RECORD +0 -54
- {haiku_rag-0.10.2.dist-info → haiku_rag-0.14.0.dist-info}/WHEEL +0 -0
- {haiku_rag-0.10.2.dist-info → haiku_rag-0.14.0.dist-info}/entry_points.txt +0 -0
- {haiku_rag-0.10.2.dist-info → haiku_rag-0.14.0.dist-info}/licenses/LICENSE +0 -0
--- haiku/rag/store/repositories/chunk.py
+++ /dev/null
@@ -1,424 +0,0 @@
-import asyncio
-import inspect
-import json
-import logging
-from uuid import uuid4
-
-from docling_core.types.doc.document import DoclingDocument
-from lancedb.rerankers import RRFReranker
-
-from haiku.rag.chunker import chunker
-from haiku.rag.config import Config
-from haiku.rag.embeddings import get_embedder
-from haiku.rag.store.engine import DocumentRecord, Store
-from haiku.rag.store.models.chunk import Chunk
-from haiku.rag.utils import load_callable, text_to_docling_document
-
-logger = logging.getLogger(__name__)
-
-
-class ChunkRepository:
-    """Repository for Chunk operations."""
-
-    def __init__(self, store: Store) -> None:
-        self.store = store
-        self.embedder = get_embedder()
-        self._optimize_lock = asyncio.Lock()
-
-    def _ensure_fts_index(self) -> None:
-        """Ensure FTS index exists on the content column."""
-        try:
-            self.store.chunks_table.create_fts_index(
-                "content", replace=True, with_position=True, remove_stop_words=False
-            )
-        except Exception as e:
-            # Log the error but don't fail - FTS might already exist
-            logger.debug(f"FTS index creation skipped: {e}")
-
-    async def _optimize(self) -> None:
-        """Optimize the chunks table to refresh indexes."""
-        # Skip optimization for LanceDB Cloud as it handles this automatically
-        if Config.LANCEDB_URI and Config.LANCEDB_URI.startswith("db://"):
-            return
-
-        async with self._optimize_lock:
-            try:
-                self.store.chunks_table.optimize()
-            except (RuntimeError, OSError) as e:
-                # Handle "too many open files" and other resource errors gracefully
-                logger.debug(
-                    f"Table optimization skipped due to resource constraints: {e}"
-                )
-
-    async def create(self, entity: Chunk) -> Chunk:
-        """Create a chunk in the database."""
-        assert entity.document_id, "Chunk must have a document_id to be created"
-
-        chunk_id = str(uuid4())
-
-        # Generate embedding if not provided
-        if entity.embedding is not None:
-            embedding = entity.embedding
-        else:
-            embedding = await self.embedder.embed(entity.content)
-        order_val = int(entity.order)
-
-        chunk_record = self.store.ChunkRecord(
-            id=chunk_id,
-            document_id=entity.document_id,
-            content=entity.content,
-            metadata=json.dumps(
-                {k: v for k, v in entity.metadata.items() if k != "order"}
-            ),
-            order=order_val,
-            vector=embedding,
-        )
-
-        self.store.chunks_table.add([chunk_record])
-
-        entity.id = chunk_id
-
-        # Try to optimize if not currently locked (non-blocking)
-        if not self._optimize_lock.locked():
-            asyncio.create_task(self._optimize())
-
-        return entity
-
-    async def get_by_id(self, entity_id: str) -> Chunk | None:
-        """Get a chunk by its ID."""
-        results = list(
-            self.store.chunks_table.search()
-            .where(f"id = '{entity_id}'")
-            .limit(1)
-            .to_pydantic(self.store.ChunkRecord)
-        )
-
-        if not results:
-            return None
-
-        chunk_record = results[0]
-        md = json.loads(chunk_record.metadata)
-        return Chunk(
-            id=chunk_record.id,
-            document_id=chunk_record.document_id,
-            content=chunk_record.content,
-            metadata=md,
-            order=chunk_record.order,
-        )
-
-    async def update(self, entity: Chunk) -> Chunk:
-        """Update an existing chunk."""
-        assert entity.id, "Chunk ID is required for update"
-
-        embedding = await self.embedder.embed(entity.content)
-        order_val = int(entity.order)
-
-        self.store.chunks_table.update(
-            where=f"id = '{entity.id}'",
-            values={
-                "document_id": entity.document_id,
-                "content": entity.content,
-                "metadata": json.dumps(
-                    {k: v for k, v in entity.metadata.items() if k != "order"}
-                ),
-                "order": order_val,
-                "vector": embedding,
-            },
-        )
-        # Try to optimize if not currently locked (non-blocking)
-        if not self._optimize_lock.locked():
-            asyncio.create_task(self._optimize())
-
-        return entity
-
-    async def delete(self, entity_id: str) -> bool:
-        """Delete a chunk by its ID."""
-        chunk = await self.get_by_id(entity_id)
-        if chunk is None:
-            return False
-
-        self.store.chunks_table.delete(f"id = '{entity_id}'")
-        return True
-
-    async def list_all(
-        self, limit: int | None = None, offset: int | None = None
-    ) -> list[Chunk]:
-        """List all chunks with optional pagination."""
-        query = self.store.chunks_table.search()
-
-        if offset is not None:
-            query = query.offset(offset)
-        if limit is not None:
-            query = query.limit(limit)
-
-        results = list(query.to_pydantic(self.store.ChunkRecord))
-
-        chunks: list[Chunk] = []
-        for rec in results:
-            md = json.loads(rec.metadata)
-            chunks.append(
-                Chunk(
-                    id=rec.id,
-                    document_id=rec.document_id,
-                    content=rec.content,
-                    metadata=md,
-                    order=rec.order,
-                )
-            )
-        return chunks
-
-    async def create_chunks_for_document(
-        self, document_id: str, document: DoclingDocument
-    ) -> list[Chunk]:
-        """Create chunks and embeddings for a document from DoclingDocument."""
-        # Optionally preprocess markdown before chunking
-        processed_document = document
-        preprocessor_path = Config.MARKDOWN_PREPROCESSOR
-        if preprocessor_path:
-            try:
-                pre_fn = load_callable(preprocessor_path)
-                markdown = document.export_to_markdown()
-                result = pre_fn(markdown)
-                if inspect.isawaitable(result):
-                    result = await result  # type: ignore[assignment]
-                processed_markdown = result
-                if not isinstance(processed_markdown, str):
-                    raise ValueError("Preprocessor must return a markdown string")
-                processed_document = text_to_docling_document(
-                    processed_markdown, name="content.md"
-                )
-            except Exception as e:
-                logger.error(
-                    f"Failed to apply MARKDOWN_PREPROCESSOR '{preprocessor_path}': {e}. Proceeding without preprocessing."
-                )
-                raise e
-
-        chunk_texts = await chunker.chunk(processed_document)
-
-        embeddings = await self.embedder.embed(chunk_texts)
-
-        # Prepare all chunk records for batch insertion
-        chunk_records = []
-        created_chunks = []
-
-        for order, (chunk_text, embedding) in enumerate(zip(chunk_texts, embeddings)):
-            chunk_id = str(uuid4())
-
-            chunk_record = self.store.ChunkRecord(
-                id=chunk_id,
-                document_id=document_id,
-                content=chunk_text,
-                metadata=json.dumps({}),
-                order=order,
-                vector=embedding,
-            )
-            chunk_records.append(chunk_record)
-
-            chunk = Chunk(
-                id=chunk_id,
-                document_id=document_id,
-                content=chunk_text,
-                metadata={},
-                order=order,
-            )
-            created_chunks.append(chunk)
-
-        # Batch insert all chunks at once
-        if chunk_records:
-            self.store.chunks_table.add(chunk_records)
-
-        # Force optimization once at the end for bulk operations
-        await self._optimize()
-        return created_chunks
-
-    async def delete_all(self) -> None:
-        """Delete all chunks from the database."""
-        # Drop and recreate table to clear all data
-        self.store.db.drop_table("chunks")
-        self.store.chunks_table = self.store.db.create_table(
-            "chunks", schema=self.store.ChunkRecord
-        )
-        # Create FTS index on the new table with phrase query support
-        self.store.chunks_table.create_fts_index(
-            "content", replace=True, with_position=True, remove_stop_words=False
-        )
-
-    async def delete_by_document_id(self, document_id: str) -> bool:
-        """Delete all chunks for a document."""
-        chunks = await self.get_by_document_id(document_id)
-
-        if not chunks:
-            return False
-
-        self.store.chunks_table.delete(f"document_id = '{document_id}'")
-        return True
-
-    async def search(
-        self, query: str, limit: int = 5, search_type: str = "hybrid"
-    ) -> list[tuple[Chunk, float]]:
-        """Search for relevant chunks using the specified search method.
-
-        Args:
-            query: The search query string.
-            limit: Maximum number of results to return.
-            search_type: Type of search - "vector", "fts", or "hybrid" (default).
-
-        Returns:
-            List of (chunk, score) tuples ordered by relevance.
-        """
-        if not query.strip():
-            return []
-
-        if search_type == "vector":
-            query_embedding = await self.embedder.embed(query)
-
-            results = self.store.chunks_table.search(
-                query_embedding, query_type="vector", vector_column_name="vector"
-            ).limit(limit)
-
-            return await self._process_search_results(results)
-
-        elif search_type == "fts":
-            results = self.store.chunks_table.search(query, query_type="fts").limit(
-                limit
-            )
-            return await self._process_search_results(results)
-
-        else:  # hybrid (default)
-            query_embedding = await self.embedder.embed(query)
-
-            # Create RRF reranker
-            reranker = RRFReranker()
-
-            # Perform native hybrid search with RRF reranking
-            results = (
-                self.store.chunks_table.search(query_type="hybrid")
-                .vector(query_embedding)
-                .text(query)
-                .rerank(reranker)
-                .limit(limit)
-            )
-            return await self._process_search_results(results)
-
-    async def get_by_document_id(self, document_id: str) -> list[Chunk]:
-        """Get all chunks for a specific document."""
-        results = list(
-            self.store.chunks_table.search()
-            .where(f"document_id = '{document_id}'")
-            .to_pydantic(self.store.ChunkRecord)
-        )
-
-        # Get document info
-        doc_results = list(
-            self.store.documents_table.search()
-            .where(f"id = '{document_id}'")
-            .limit(1)
-            .to_pydantic(DocumentRecord)
-        )
-
-        doc_uri = doc_results[0].uri if doc_results else None
-        doc_title = doc_results[0].title if doc_results else None
-        doc_meta = doc_results[0].metadata if doc_results else "{}"
-
-        chunks: list[Chunk] = []
-        for rec in results:
-            md = json.loads(rec.metadata)
-            chunks.append(
-                Chunk(
-                    id=rec.id,
-                    document_id=rec.document_id,
-                    content=rec.content,
-                    metadata=md,
-                    order=rec.order,
-                    document_uri=doc_uri,
-                    document_title=doc_title,
-                    document_meta=json.loads(doc_meta),
-                )
-            )
-
-        chunks.sort(key=lambda c: c.order)
-        return chunks
-
-    async def get_adjacent_chunks(self, chunk: Chunk, num_adjacent: int) -> list[Chunk]:
-        """Get adjacent chunks before and after the given chunk within the same document."""
-        assert chunk.document_id, "Document id is required for adjacent chunk finding"
-
-        chunk_order = chunk.order
-
-        # Fetch chunks for the same document and filter by order proximity
-        all_chunks = await self.get_by_document_id(chunk.document_id)
-
-        adjacent_chunks: list[Chunk] = []
-        for c in all_chunks:
-            c_order = c.order
-            if c.id != chunk.id and abs(c_order - chunk_order) <= num_adjacent:
-                adjacent_chunks.append(c)
-
-        return adjacent_chunks
-
-    async def _process_search_results(self, query_result) -> list[tuple[Chunk, float]]:
-        """Process search results into chunks with document info and scores."""
-        chunks_with_scores = []
-
-        # Get both arrow and pydantic results to access scores
-        arrow_result = query_result.to_arrow()
-        pydantic_results = list(query_result.to_pydantic(self.store.ChunkRecord))
-
-        # Extract scores from arrow result based on search type
-        scores = []
-        column_names = arrow_result.column_names
-
-        if "_distance" in column_names:
-            # Vector search - distance (lower is better, convert to similarity)
-            distances = arrow_result.column("_distance").to_pylist()
-            scores = [max(0.0, 1.0 / (1.0 + dist)) for dist in distances]
-        elif "_relevance_score" in column_names:
-            # Hybrid search - relevance score (higher is better)
-            scores = arrow_result.column("_relevance_score").to_pylist()
-        elif "_score" in column_names:
-            # FTS search - score (higher is better)
-            scores = arrow_result.column("_score").to_pylist()
-        else:
-            raise ValueError("Unknown search result format, cannot extract scores")
-
-        # Collect all unique document IDs for batch lookup
-        document_ids = list(set(chunk.document_id for chunk in pydantic_results))
-
-        # Batch fetch all documents at once
-        documents_map = {}
-        if document_ids:
-            # Create a WHERE clause for all document IDs
-            where_clause = " OR ".join(f"id = '{doc_id}'" for doc_id in document_ids)
-            doc_results = list(
-                self.store.documents_table.search()
-                .where(where_clause)
-                .to_pydantic(DocumentRecord)
-            )
-            documents_map = {doc.id: doc for doc in doc_results}
-
-        for i, chunk_record in enumerate(pydantic_results):
-            # Get document info from pre-fetched map
-            doc = documents_map.get(chunk_record.document_id)
-            doc_uri = doc.uri if doc else None
-            doc_title = doc.title if doc else None
-            doc_meta = doc.metadata if doc else "{}"
-
-            md = json.loads(chunk_record.metadata)
-
-            chunk = Chunk(
-                id=chunk_record.id,
-                document_id=chunk_record.document_id,
-                content=chunk_record.content,
-                metadata=md,
-                order=chunk_record.order,
-                document_uri=doc_uri,
-                document_title=doc_title,
-                document_meta=json.loads(doc_meta),
-            )
-
-            # Get score from arrow result
-            score = scores[i] if i < len(scores) else 1.0
-
-            chunks_with_scores.append((chunk, score))
-
-        return chunks_with_scores
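For context, the removed haiku/rag/store/repositories/chunk.py above provided the async CRUD and search layer over the LanceDB "chunks" table, including vector, full-text, and RRF-reranked hybrid search. The sketch below shows roughly how that repository was driven under the 0.10.2 layout. It is illustrative only: the Store("example.lancedb") call, the document id, and the query string are assumptions rather than anything taken from this diff, and embedding requires an embedder configured via haiku.rag.config.

import asyncio

# Modules shipped in 0.10.2 and removed from the 0.14.0 wheel (see the file list above).
from haiku.rag.store.engine import Store
from haiku.rag.store.models.chunk import Chunk
from haiku.rag.store.repositories.chunk import ChunkRepository


async def main() -> None:
    # Assumption: Store opens a local LanceDB database at the given path.
    store = Store("example.lancedb")
    repo = ChunkRepository(store)

    # create() embeds the content (unless an embedding is supplied) and inserts the record.
    await repo.create(
        Chunk(document_id="doc-1", content="LanceDB keeps vectors and text together.", metadata={}, order=0)
    )

    # Hybrid search combines vector and FTS results with RRF and returns (Chunk, score) pairs.
    for chunk, score in await repo.search("vector storage", limit=3, search_type="hybrid"):
        print(f"{score:.3f}  {chunk.content[:60]}")


asyncio.run(main())

Since 0.14.0 drops these modules from the wheel, imports like the ones above fail against the new version.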
--- haiku/rag/store/repositories/document.py
+++ /dev/null
@@ -1,237 +0,0 @@
-import json
-from datetime import datetime
-from typing import TYPE_CHECKING
-from uuid import uuid4
-
-from docling_core.types.doc.document import DoclingDocument
-
-from haiku.rag.store.engine import DocumentRecord, Store
-from haiku.rag.store.models.document import Document
-
-if TYPE_CHECKING:
-    from haiku.rag.store.models.chunk import Chunk
-
-
-class DocumentRepository:
-    """Repository for Document operations."""
-
-    def __init__(self, store: Store) -> None:
-        self.store = store
-        self._chunk_repository = None
-
-    @property
-    def chunk_repository(self):
-        """Lazy-load ChunkRepository when needed."""
-        if self._chunk_repository is None:
-            from haiku.rag.store.repositories.chunk import ChunkRepository
-
-            self._chunk_repository = ChunkRepository(self.store)
-        return self._chunk_repository
-
-    def _record_to_document(self, record: DocumentRecord) -> Document:
-        """Convert a DocumentRecord to a Document model."""
-        return Document(
-            id=record.id,
-            content=record.content,
-            uri=record.uri,
-            title=record.title,
-            metadata=json.loads(record.metadata),
-            created_at=datetime.fromisoformat(record.created_at)
-            if record.created_at
-            else datetime.now(),
-            updated_at=datetime.fromisoformat(record.updated_at)
-            if record.updated_at
-            else datetime.now(),
-        )
-
-    async def create(self, entity: Document) -> Document:
-        """Create a document in the database."""
-        # Generate new UUID
-        doc_id = str(uuid4())
-
-        # Create timestamp
-        now = datetime.now().isoformat()
-
-        # Create document record
-        doc_record = DocumentRecord(
-            id=doc_id,
-            content=entity.content,
-            uri=entity.uri,
-            title=entity.title,
-            metadata=json.dumps(entity.metadata),
-            created_at=now,
-            updated_at=now,
-        )
-
-        # Add to table
-        self.store.documents_table.add([doc_record])
-
-        entity.id = doc_id
-        entity.created_at = datetime.fromisoformat(now)
-        entity.updated_at = datetime.fromisoformat(now)
-        return entity
-
-    async def get_by_id(self, entity_id: str) -> Document | None:
-        """Get a document by its ID."""
-        results = list(
-            self.store.documents_table.search()
-            .where(f"id = '{entity_id}'")
-            .limit(1)
-            .to_pydantic(DocumentRecord)
-        )
-
-        if not results:
-            return None
-
-        return self._record_to_document(results[0])
-
-    async def update(self, entity: Document) -> Document:
-        """Update an existing document."""
-        assert entity.id, "Document ID is required for update"
-
-        # Update timestamp
-        now = datetime.now().isoformat()
-        entity.updated_at = datetime.fromisoformat(now)
-
-        # Update the record
-        self.store.documents_table.update(
-            where=f"id = '{entity.id}'",
-            values={
-                "content": entity.content,
-                "uri": entity.uri,
-                "title": entity.title,
-                "metadata": json.dumps(entity.metadata),
-                "updated_at": now,
-            },
-        )
-
-        return entity
-
-    async def delete(self, entity_id: str) -> bool:
-        """Delete a document by its ID."""
-        # Check if document exists
-        doc = await self.get_by_id(entity_id)
-        if doc is None:
-            return False
-
-        # Delete associated chunks first
-        await self.chunk_repository.delete_by_document_id(entity_id)
-
-        # Delete the document
-        self.store.documents_table.delete(f"id = '{entity_id}'")
-        return True
-
-    async def list_all(
-        self, limit: int | None = None, offset: int | None = None
-    ) -> list[Document]:
-        """List all documents with optional pagination."""
-        query = self.store.documents_table.search()
-
-        if offset is not None:
-            query = query.offset(offset)
-        if limit is not None:
-            query = query.limit(limit)
-
-        results = list(query.to_pydantic(DocumentRecord))
-        return [self._record_to_document(doc) for doc in results]
-
-    async def get_by_uri(self, uri: str) -> Document | None:
-        """Get a document by its URI."""
-        results = list(
-            self.store.documents_table.search()
-            .where(f"uri = '{uri}'")
-            .limit(1)
-            .to_pydantic(DocumentRecord)
-        )
-
-        if not results:
-            return None
-
-        return self._record_to_document(results[0])
-
-    async def delete_all(self) -> None:
-        """Delete all documents from the database."""
-        # Delete all chunks first
-        await self.chunk_repository.delete_all()
-
-        # Get count before deletion
-        count = len(
-            list(
-                self.store.documents_table.search().limit(1).to_pydantic(DocumentRecord)
-            )
-        )
-        if count > 0:
-            # Drop and recreate table to clear all data
-            self.store.db.drop_table("documents")
-            self.store.documents_table = self.store.db.create_table(
-                "documents", schema=DocumentRecord
-            )
-
-    async def _create_with_docling(
-        self,
-        entity: Document,
-        docling_document: DoclingDocument,
-        chunks: list["Chunk"] | None = None,
-    ) -> Document:
-        """Create a document with its chunks and embeddings."""
-        # Snapshot table versions for versioned rollback (if supported)
-        versions = self.store.current_table_versions()
-
-        # Create the document
-        created_doc = await self.create(entity)
-
-        # Attempt to create chunks; on failure, prefer version rollback
-        try:
-            # Create chunks if not provided
-            if chunks is None:
-                assert created_doc.id is not None, (
-                    "Document ID should not be None after creation"
-                )
-                await self.chunk_repository.create_chunks_for_document(
-                    created_doc.id, docling_document
-                )
-            else:
-                # Use provided chunks, set order from list position
-                assert created_doc.id is not None, (
-                    "Document ID should not be None after creation"
-                )
-                for order, chunk in enumerate(chunks):
-                    chunk.document_id = created_doc.id
-                    chunk.order = order
-                    await self.chunk_repository.create(chunk)
-
-            return created_doc
-        except Exception:
-            # Roll back to the captured versions and re-raise
-            self.store.restore_table_versions(versions)
-            raise
-
-    async def _update_with_docling(
-        self, entity: Document, docling_document: DoclingDocument
-    ) -> Document:
-        """Update a document and regenerate its chunks."""
-        assert entity.id is not None, "Document ID is required for update"
-
-        # Snapshot table versions for versioned rollback
-        versions = self.store.current_table_versions()
-
-        # Delete existing chunks before writing new ones
-        await self.chunk_repository.delete_by_document_id(entity.id)
-
-        try:
-            # Update the document
-            updated_doc = await self.update(entity)
-
-            # Create new chunks
-            assert updated_doc.id is not None, (
-                "Document ID should not be None after update"
-            )
-            await self.chunk_repository.create_chunks_for_document(
-                updated_doc.id, docling_document
-            )
-
-            return updated_doc
-        except Exception:
-            # Roll back to the captured versions and re-raise
-            self.store.restore_table_versions(versions)
-            raise
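The removed haiku/rag/store/repositories/document.py above paired document CRUD with the chunk repository: deleting a document also deletes its chunks, and the _create_with_docling / _update_with_docling helpers snapshot LanceDB table versions so a failed chunking run can be rolled back. A minimal usage sketch under the same assumptions as before (made-up Store path and URI, field defaults on Document assumed):

import asyncio

from haiku.rag.store.engine import Store  # removed module, 0.10.2 layout
from haiku.rag.store.models.document import Document
from haiku.rag.store.repositories.document import DocumentRepository


async def main() -> None:
    store = Store("example.lancedb")  # assumption: local LanceDB path
    repo = DocumentRepository(store)

    # create() assigns the id and timestamps before returning the entity.
    doc = await repo.create(
        Document(content="# Notes\nHello.", uri="file:///tmp/notes.md", metadata={})
    )

    # Documents can be looked up by URI as well as by id.
    found = await repo.get_by_uri("file:///tmp/notes.md")
    assert found is not None and found.id == doc.id

    # delete() removes the document's chunks first, then the document row.
    await repo.delete(doc.id)


asyncio.run(main())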