haiku_rag_slim-0.16.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the package as it appears in its public registry.

Potentially problematic release: this version of haiku.rag-slim might be problematic.

Files changed (71)
  1. haiku/rag/__init__.py +0 -0
  2. haiku/rag/app.py +542 -0
  3. haiku/rag/chunker.py +65 -0
  4. haiku/rag/cli.py +466 -0
  5. haiku/rag/client.py +731 -0
  6. haiku/rag/config/__init__.py +74 -0
  7. haiku/rag/config/loader.py +94 -0
  8. haiku/rag/config/models.py +99 -0
  9. haiku/rag/embeddings/__init__.py +49 -0
  10. haiku/rag/embeddings/base.py +25 -0
  11. haiku/rag/embeddings/ollama.py +28 -0
  12. haiku/rag/embeddings/openai.py +26 -0
  13. haiku/rag/embeddings/vllm.py +29 -0
  14. haiku/rag/embeddings/voyageai.py +27 -0
  15. haiku/rag/graph/__init__.py +26 -0
  16. haiku/rag/graph/agui/__init__.py +53 -0
  17. haiku/rag/graph/agui/cli_renderer.py +135 -0
  18. haiku/rag/graph/agui/emitter.py +197 -0
  19. haiku/rag/graph/agui/events.py +254 -0
  20. haiku/rag/graph/agui/server.py +310 -0
  21. haiku/rag/graph/agui/state.py +34 -0
  22. haiku/rag/graph/agui/stream.py +86 -0
  23. haiku/rag/graph/common/__init__.py +5 -0
  24. haiku/rag/graph/common/models.py +42 -0
  25. haiku/rag/graph/common/nodes.py +265 -0
  26. haiku/rag/graph/common/prompts.py +46 -0
  27. haiku/rag/graph/common/utils.py +44 -0
  28. haiku/rag/graph/deep_qa/__init__.py +1 -0
  29. haiku/rag/graph/deep_qa/dependencies.py +27 -0
  30. haiku/rag/graph/deep_qa/graph.py +243 -0
  31. haiku/rag/graph/deep_qa/models.py +20 -0
  32. haiku/rag/graph/deep_qa/prompts.py +59 -0
  33. haiku/rag/graph/deep_qa/state.py +56 -0
  34. haiku/rag/graph/research/__init__.py +3 -0
  35. haiku/rag/graph/research/common.py +87 -0
  36. haiku/rag/graph/research/dependencies.py +151 -0
  37. haiku/rag/graph/research/graph.py +295 -0
  38. haiku/rag/graph/research/models.py +166 -0
  39. haiku/rag/graph/research/prompts.py +107 -0
  40. haiku/rag/graph/research/state.py +85 -0
  41. haiku/rag/logging.py +56 -0
  42. haiku/rag/mcp.py +245 -0
  43. haiku/rag/monitor.py +194 -0
  44. haiku/rag/qa/__init__.py +33 -0
  45. haiku/rag/qa/agent.py +93 -0
  46. haiku/rag/qa/prompts.py +60 -0
  47. haiku/rag/reader.py +135 -0
  48. haiku/rag/reranking/__init__.py +63 -0
  49. haiku/rag/reranking/base.py +13 -0
  50. haiku/rag/reranking/cohere.py +34 -0
  51. haiku/rag/reranking/mxbai.py +28 -0
  52. haiku/rag/reranking/vllm.py +44 -0
  53. haiku/rag/reranking/zeroentropy.py +59 -0
  54. haiku/rag/store/__init__.py +4 -0
  55. haiku/rag/store/engine.py +309 -0
  56. haiku/rag/store/models/__init__.py +4 -0
  57. haiku/rag/store/models/chunk.py +17 -0
  58. haiku/rag/store/models/document.py +17 -0
  59. haiku/rag/store/repositories/__init__.py +9 -0
  60. haiku/rag/store/repositories/chunk.py +442 -0
  61. haiku/rag/store/repositories/document.py +261 -0
  62. haiku/rag/store/repositories/settings.py +165 -0
  63. haiku/rag/store/upgrades/__init__.py +62 -0
  64. haiku/rag/store/upgrades/v0_10_1.py +64 -0
  65. haiku/rag/store/upgrades/v0_9_3.py +112 -0
  66. haiku/rag/utils.py +211 -0
  67. haiku_rag_slim-0.16.0.dist-info/METADATA +128 -0
  68. haiku_rag_slim-0.16.0.dist-info/RECORD +71 -0
  69. haiku_rag_slim-0.16.0.dist-info/WHEEL +4 -0
  70. haiku_rag_slim-0.16.0.dist-info/entry_points.txt +2 -0
  71. haiku_rag_slim-0.16.0.dist-info/licenses/LICENSE +7 -0

haiku/rag/store/repositories/chunk.py
@@ -0,0 +1,442 @@
+ import inspect
+ import json
+ import logging
+ from typing import TYPE_CHECKING
+ from uuid import uuid4
+
+ if TYPE_CHECKING:
+     import pandas as pd
+     from lancedb.query import LanceQueryBuilder
+
+ from lancedb.rerankers import RRFReranker
+
+ from haiku.rag.store.engine import DocumentRecord, Store
+ from haiku.rag.store.models.chunk import Chunk
+ from haiku.rag.utils import load_callable
+
+ if TYPE_CHECKING:
+     from docling_core.types.doc.document import DoclingDocument
+
+ logger = logging.getLogger(__name__)
+
+
+ class ChunkRepository:
+     """Repository for Chunk operations."""
+
+     def __init__(self, store: Store) -> None:
+         self.store = store
+         self.embedder = store.embedder
+
+     def _ensure_fts_index(self) -> None:
+         """Ensure FTS index exists on the content column."""
+         try:
+             self.store.chunks_table.create_fts_index(
+                 "content", replace=True, with_position=True, remove_stop_words=False
+             )
+         except Exception as e:
+             # Log the error but don't fail - FTS might already exist
+             logger.debug(f"FTS index creation skipped: {e}")
+
+     async def create(self, entity: Chunk) -> Chunk:
+         """Create a chunk in the database."""
+         assert entity.document_id, "Chunk must have a document_id to be created"
+
+         chunk_id = str(uuid4())
+
+         # Generate embedding if not provided
+         if entity.embedding is not None:
+             embedding = entity.embedding
+         else:
+             embedding = await self.embedder.embed(entity.content)
+         order_val = int(entity.order)
+
+         chunk_record = self.store.ChunkRecord(
+             id=chunk_id,
+             document_id=entity.document_id,
+             content=entity.content,
+             metadata=json.dumps(
+                 {k: v for k, v in entity.metadata.items() if k != "order"}
+             ),
+             order=order_val,
+             vector=embedding,
+         )
+
+         self.store.chunks_table.add([chunk_record])
+
+         entity.id = chunk_id
+         return entity
+
+     async def get_by_id(self, entity_id: str) -> Chunk | None:
+         """Get a chunk by its ID."""
+         results = list(
+             self.store.chunks_table.search()
+             .where(f"id = '{entity_id}'")
+             .limit(1)
+             .to_pydantic(self.store.ChunkRecord)
+         )
+
+         if not results:
+             return None
+
+         chunk_record = results[0]
+         md = json.loads(chunk_record.metadata)
+         return Chunk(
+             id=chunk_record.id,
+             document_id=chunk_record.document_id,
+             content=chunk_record.content,
+             metadata=md,
+             order=chunk_record.order,
+         )
+
+     async def update(self, entity: Chunk) -> Chunk:
+         """Update an existing chunk."""
+         assert entity.id, "Chunk ID is required for update"
+
+         embedding = await self.embedder.embed(entity.content)
+         order_val = int(entity.order)
+
+         self.store.chunks_table.update(
+             where=f"id = '{entity.id}'",
+             values={
+                 "document_id": entity.document_id,
+                 "content": entity.content,
+                 "metadata": json.dumps(
+                     {k: v for k, v in entity.metadata.items() if k != "order"}
+                 ),
+                 "order": order_val,
+                 "vector": embedding,
+             },
+         )
+         return entity
+
+     async def delete(self, entity_id: str) -> bool:
+         """Delete a chunk by its ID."""
+         chunk = await self.get_by_id(entity_id)
+         if chunk is None:
+             return False
+
+         self.store.chunks_table.delete(f"id = '{entity_id}'")
+         return True
+
+     async def list_all(
+         self, limit: int | None = None, offset: int | None = None
+     ) -> list[Chunk]:
+         """List all chunks with optional pagination."""
+         query = self.store.chunks_table.search()
+
+         if offset is not None:
+             query = query.offset(offset)
+         if limit is not None:
+             query = query.limit(limit)
+
+         results = list(query.to_pydantic(self.store.ChunkRecord))
+
+         chunks: list[Chunk] = []
+         for rec in results:
+             md = json.loads(rec.metadata)
+             chunks.append(
+                 Chunk(
+                     id=rec.id,
+                     document_id=rec.document_id,
+                     content=rec.content,
+                     metadata=md,
+                     order=rec.order,
+                 )
+             )
+         return chunks
+
+     async def create_chunks_for_document(
+         self, document_id: str, document: "DoclingDocument"
+     ) -> list[Chunk]:
+         """Create chunks and embeddings for a document from DoclingDocument."""
+         # Lazy imports to avoid loading docling during module import
+         from haiku.rag.chunker import chunker
+         from haiku.rag.utils import text_to_docling_document
+
+         # Optionally preprocess markdown before chunking
+         processed_document = document
+         preprocessor_path = self.store._config.processing.markdown_preprocessor
+         if preprocessor_path:
+             try:
+                 pre_fn = load_callable(preprocessor_path)
+                 markdown = document.export_to_markdown()
+                 result = pre_fn(markdown)
+                 if inspect.isawaitable(result):
+                     result = await result  # type: ignore[assignment]
+                 processed_markdown = result
+                 if not isinstance(processed_markdown, str):
+                     raise ValueError("Preprocessor must return a markdown string")
+                 processed_document = text_to_docling_document(
+                     processed_markdown, name="content.md"
+                 )
+             except Exception as e:
+                 logger.error(
+                     f"Failed to apply MARKDOWN_PREPROCESSOR '{preprocessor_path}': {e}"
+                 )
+                 raise
+
+         chunk_texts = await chunker.chunk(processed_document)
+
+         embeddings = await self.embedder.embed(chunk_texts)
+
+         # Prepare all chunk records for batch insertion
+         chunk_records = []
+         created_chunks = []
+
+         for order, (chunk_text, embedding) in enumerate(zip(chunk_texts, embeddings)):
+             chunk_id = str(uuid4())
+
+             chunk_record = self.store.ChunkRecord(
+                 id=chunk_id,
+                 document_id=document_id,
+                 content=chunk_text,
+                 metadata=json.dumps({}),
+                 order=order,
+                 vector=embedding,
+             )
+             chunk_records.append(chunk_record)
+
+             chunk = Chunk(
+                 id=chunk_id,
+                 document_id=document_id,
+                 content=chunk_text,
+                 metadata={},
+                 order=order,
+             )
+             created_chunks.append(chunk)
+
+         # Batch insert all chunks at once
+         if chunk_records:
+             self.store.chunks_table.add(chunk_records)
+
+         return created_chunks
+
+     async def delete_all(self) -> None:
+         """Delete all chunks from the database."""
+         # Drop and recreate table to clear all data
+         self.store.db.drop_table("chunks")
+         self.store.chunks_table = self.store.db.create_table(
+             "chunks", schema=self.store.ChunkRecord
+         )
+         # Create FTS index on the new table with phrase query support
+         self.store.chunks_table.create_fts_index(
+             "content", replace=True, with_position=True, remove_stop_words=False
+         )
+
+     async def delete_by_document_id(self, document_id: str) -> bool:
+         """Delete all chunks for a document."""
+         chunks = await self.get_by_document_id(document_id)
+
+         if not chunks:
+             return False
+
+         self.store.chunks_table.delete(f"document_id = '{document_id}'")
+         return True
+
+     async def search(
+         self,
+         query: str,
+         limit: int = 5,
+         search_type: str = "hybrid",
+         filter: str | None = None,
+     ) -> list[tuple[Chunk, float]]:
+         """Search for relevant chunks using the specified search method.
+
+         Args:
+             query: The search query string.
+             limit: Maximum number of results to return.
+             search_type: Type of search - "vector", "fts", or "hybrid" (default).
+             filter: Optional SQL WHERE clause to filter documents before searching chunks.
+
+         Returns:
+             List of (chunk, score) tuples ordered by relevance.
+         """
+         if not query.strip():
+             return []
+         filtered_doc_ids = None
+         if filter:
+             # We perform filtering as a two-step process, first filtering documents, then
+             # filtering chunks based on those document IDs.
+             # This is because LanceDB does not support joins directly in search queries.
+             docs_df = (
+                 self.store.documents_table.search()
+                 .select(["id"])
+                 .where(filter)
+                 .to_pandas()
+             )
+             # Early exit if no documents match the filter
+             if docs_df.empty:
+                 return []
+             # Keep as pandas Series for efficient vectorized operations
+             filtered_doc_ids = docs_df["id"]
+
+         # Prepare search query based on search type
+         if search_type == "vector":
+             query_embedding = await self.embedder.embed(query)
+             results = self.store.chunks_table.search(
+                 query_embedding, query_type="vector", vector_column_name="vector"
+             )
+
+         elif search_type == "fts":
+             results = self.store.chunks_table.search(query, query_type="fts")
+
+         else:  # hybrid (default)
+             query_embedding = await self.embedder.embed(query)
+             # Create RRF reranker
+             reranker = RRFReranker()
+             # Perform native hybrid search with RRF reranking
+             results = (
+                 self.store.chunks_table.search(query_type="hybrid")
+                 .vector(query_embedding)
+                 .text(query)
+                 .rerank(reranker)
+             )
+
+         # Apply filtering if needed (common for all search types)
+         if filtered_doc_ids is not None:
+             chunks_df = results.to_pandas()
+             filtered_chunks_df = chunks_df.loc[
+                 chunks_df["document_id"].isin(filtered_doc_ids)
+             ].head(limit)
+             return await self._process_search_results(filtered_chunks_df)
+
+         # No filtering needed, apply limit and return
+         results = results.limit(limit)
+         return await self._process_search_results(results)
+
+     async def get_by_document_id(self, document_id: str) -> list[Chunk]:
+         """Get all chunks for a specific document."""
+         results = list(
+             self.store.chunks_table.search()
+             .where(f"document_id = '{document_id}'")
+             .to_pydantic(self.store.ChunkRecord)
+         )
+
+         # Get document info
+         doc_results = list(
+             self.store.documents_table.search()
+             .where(f"id = '{document_id}'")
+             .limit(1)
+             .to_pydantic(DocumentRecord)
+         )
+
+         doc_uri = doc_results[0].uri if doc_results else None
+         doc_title = doc_results[0].title if doc_results else None
+         doc_meta = doc_results[0].metadata if doc_results else "{}"
+
+         chunks: list[Chunk] = []
+         for rec in results:
+             md = json.loads(rec.metadata)
+             chunks.append(
+                 Chunk(
+                     id=rec.id,
+                     document_id=rec.document_id,
+                     content=rec.content,
+                     metadata=md,
+                     order=rec.order,
+                     document_uri=doc_uri,
+                     document_title=doc_title,
+                     document_meta=json.loads(doc_meta),
+                 )
+             )
+
+         chunks.sort(key=lambda c: c.order)
+         return chunks
+
+     async def get_adjacent_chunks(self, chunk: Chunk, num_adjacent: int) -> list[Chunk]:
+         """Get adjacent chunks before and after the given chunk within the same document."""
+         assert chunk.document_id, "Document id is required for adjacent chunk finding"
+
+         chunk_order = chunk.order
+
+         # Fetch chunks for the same document and filter by order proximity
+         all_chunks = await self.get_by_document_id(chunk.document_id)
+
+         adjacent_chunks: list[Chunk] = []
+         for c in all_chunks:
+             c_order = c.order
+             if c.id != chunk.id and abs(c_order - chunk_order) <= num_adjacent:
+                 adjacent_chunks.append(c)
+
+         return adjacent_chunks
+
+     async def _process_search_results(
+         self, query_result: "pd.DataFrame | LanceQueryBuilder"
+     ) -> list[tuple[Chunk, float]]:
+         """Process search results into chunks with document info and scores.
+
+         Args:
+             query_result: Either a pandas DataFrame or a LanceDB query result
+         """
+         import pandas as pd
+
+         def extract_scores(df: pd.DataFrame) -> list[float]:
+             """Extract scores from DataFrame columns based on search type."""
+             if "_distance" in df.columns:
+                 # Vector search - convert distance to similarity
+                 return ((df["_distance"] + 1).rdiv(1)).clip(lower=0.0).tolist()
+             elif "_relevance_score" in df.columns:
+                 # Hybrid search - relevance score (higher is better)
+                 return df["_relevance_score"].tolist()
+             elif "_score" in df.columns:
+                 # FTS search - score (higher is better)
+                 return df["_score"].tolist()
+             else:
+                 raise ValueError("Unknown search result format, cannot extract scores")
+
+         # Convert everything to DataFrame for uniform processing
+         if isinstance(query_result, pd.DataFrame):
+             df = query_result
+         else:
+             # Convert LanceDB query result to DataFrame
+             df = query_result.to_pandas()
+
+         # Extract scores
+         scores = extract_scores(df)
+
+         # Convert DataFrame rows to ChunkRecords
+         pydantic_results = [
+             self.store.ChunkRecord(
+                 id=str(row["id"]),
+                 document_id=str(row["document_id"]),
+                 content=str(row["content"]),
+                 metadata=str(row["metadata"]),
+                 order=int(row["order"]) if "order" in row else 0,
+             )
+             for _, row in df.iterrows()
+         ]
+
+         # Collect all unique document IDs for batch lookup
+         document_ids = list(set(chunk.document_id for chunk in pydantic_results))
+
+         # Batch fetch all documents at once
+         documents_map = {}
+         if document_ids:
+             # Use IN clause for efficient batch lookup
+             id_list = "', '".join(document_ids)
+             where_clause = f"id IN ('{id_list}')"
+             doc_results = list(
+                 self.store.documents_table.search()
+                 .where(where_clause)
+                 .to_pydantic(DocumentRecord)
+             )
+             documents_map = {doc.id: doc for doc in doc_results}
+
+         # Build final results with document info
+         chunks_with_scores = []
+         for i, chunk_record in enumerate(pydantic_results):
+             doc = documents_map.get(chunk_record.document_id)
+             chunk = Chunk(
+                 id=chunk_record.id,
+                 document_id=chunk_record.document_id,
+                 content=chunk_record.content,
+                 metadata=json.loads(chunk_record.metadata),
+                 order=chunk_record.order,
+                 document_uri=doc.uri if doc else None,
+                 document_title=doc.title if doc else None,
+                 document_meta=json.loads(doc.metadata if doc else "{}"),
+             )
+             score = scores[i] if i < len(scores) else 1.0
+             chunks_with_scores.append((chunk, score))
+
+         return chunks_with_scores
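
For orientation, the search() method above is the main read path of this file: vector, full-text, or hybrid (RRF-reranked) retrieval over the chunks table, with an optional SQL pre-filter on the documents table. The sketch below shows how that API might be exercised; it assumes a Store can be opened from a database path (the Store constructor lives in haiku/rag/store/engine.py and is not shown in this diff, so treat that call as a placeholder), while the ChunkRepository call and the Chunk fields are taken directly from the code above.

import asyncio

from haiku.rag.store.engine import Store
from haiku.rag.store.repositories.chunk import ChunkRepository


async def main() -> None:
    # Placeholder: the Store constructor signature is not part of this diff.
    store = Store("data/haiku-rag.lancedb")
    chunks = ChunkRepository(store)

    # Hybrid search (vector + full-text with RRF reranking); the optional
    # filter is a SQL WHERE clause applied to the documents table first.
    results = await chunks.search(
        "how is the embedder configured?",
        limit=5,
        search_type="hybrid",
        filter="title = 'README'",
    )
    for chunk, score in results:
        print(f"{score:.3f}  {chunk.document_uri}: {chunk.content[:80]}")


asyncio.run(main())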

haiku/rag/store/repositories/document.py
@@ -0,0 +1,261 @@
+ import asyncio
+ import json
+ from datetime import datetime
+ from typing import TYPE_CHECKING
+ from uuid import uuid4
+
+ from haiku.rag.store.engine import DocumentRecord, Store
+ from haiku.rag.store.models.document import Document
+
+ if TYPE_CHECKING:
+     from docling_core.types.doc.document import DoclingDocument
+
+     from haiku.rag.store.models.chunk import Chunk
+
+
+ class DocumentRepository:
+     """Repository for Document operations."""
+
+     def __init__(self, store: Store) -> None:
+         self.store = store
+         self._chunk_repository = None
+
+     @property
+     def chunk_repository(self):
+         """Lazy-load ChunkRepository when needed."""
+         if self._chunk_repository is None:
+             from haiku.rag.store.repositories.chunk import ChunkRepository
+
+             self._chunk_repository = ChunkRepository(self.store)
+         return self._chunk_repository
+
+     def _record_to_document(self, record: DocumentRecord) -> Document:
+         """Convert a DocumentRecord to a Document model."""
+         return Document(
+             id=record.id,
+             content=record.content,
+             uri=record.uri,
+             title=record.title,
+             metadata=json.loads(record.metadata),
+             created_at=datetime.fromisoformat(record.created_at)
+             if record.created_at
+             else datetime.now(),
+             updated_at=datetime.fromisoformat(record.updated_at)
+             if record.updated_at
+             else datetime.now(),
+         )
+
+     async def create(self, entity: Document) -> Document:
+         """Create a document in the database."""
+         # Generate new UUID
+         doc_id = str(uuid4())
+
+         # Create timestamp
+         now = datetime.now().isoformat()
+
+         # Create document record
+         doc_record = DocumentRecord(
+             id=doc_id,
+             content=entity.content,
+             uri=entity.uri,
+             title=entity.title,
+             metadata=json.dumps(entity.metadata),
+             created_at=now,
+             updated_at=now,
+         )
+
+         # Add to table
+         self.store.documents_table.add([doc_record])
+
+         entity.id = doc_id
+         entity.created_at = datetime.fromisoformat(now)
+         entity.updated_at = datetime.fromisoformat(now)
+         return entity
+
+     async def get_by_id(self, entity_id: str) -> Document | None:
+         """Get a document by its ID."""
+         results = list(
+             self.store.documents_table.search()
+             .where(f"id = '{entity_id}'")
+             .limit(1)
+             .to_pydantic(DocumentRecord)
+         )
+
+         if not results:
+             return None
+
+         return self._record_to_document(results[0])
+
+     async def update(self, entity: Document) -> Document:
+         """Update an existing document."""
+         assert entity.id, "Document ID is required for update"
+
+         # Update timestamp
+         now = datetime.now().isoformat()
+         entity.updated_at = datetime.fromisoformat(now)
+
+         # Update the record
+         self.store.documents_table.update(
+             where=f"id = '{entity.id}'",
+             values={
+                 "content": entity.content,
+                 "uri": entity.uri,
+                 "title": entity.title,
+                 "metadata": json.dumps(entity.metadata),
+                 "updated_at": now,
+             },
+         )
+
+         return entity
+
+     async def delete(self, entity_id: str) -> bool:
+         """Delete a document by its ID."""
+         # Check if document exists
+         doc = await self.get_by_id(entity_id)
+         if doc is None:
+             return False
+
+         # Delete associated chunks first
+         await self.chunk_repository.delete_by_document_id(entity_id)
+
+         # Delete the document
+         self.store.documents_table.delete(f"id = '{entity_id}'")
+         return True
+
+     async def list_all(
+         self,
+         limit: int | None = None,
+         offset: int | None = None,
+         filter: str | None = None,
+     ) -> list[Document]:
+         """List all documents with optional pagination and filtering.
+
+         Args:
+             limit: Maximum number of documents to return.
+             offset: Number of documents to skip.
+             filter: Optional SQL WHERE clause to filter documents.
+
+         Returns:
+             List of Document instances matching the criteria.
+         """
+         query = self.store.documents_table.search()
+
+         if filter is not None:
+             query = query.where(filter)
+         if offset is not None:
+             query = query.offset(offset)
+         if limit is not None:
+             query = query.limit(limit)
+
+         results = list(query.to_pydantic(DocumentRecord))
+         return [self._record_to_document(doc) for doc in results]
+
+     async def get_by_uri(self, uri: str) -> Document | None:
+         """Get a document by its URI."""
+         results = list(
+             self.store.documents_table.search()
+             .where(f"uri = '{uri}'")
+             .limit(1)
+             .to_pydantic(DocumentRecord)
+         )
+
+         if not results:
+             return None
+
+         return self._record_to_document(results[0])
+
+     async def delete_all(self) -> None:
+         """Delete all documents from the database."""
+         # Delete all chunks first
+         await self.chunk_repository.delete_all()
+
+         # Check whether any documents exist before recreating the table
+         count = len(
+             list(
+                 self.store.documents_table.search().limit(1).to_pydantic(DocumentRecord)
+             )
+         )
+         if count > 0:
+             # Drop and recreate table to clear all data
+             self.store.db.drop_table("documents")
+             self.store.documents_table = self.store.db.create_table(
+                 "documents", schema=DocumentRecord
+             )
+
+     async def _create_and_chunk(
+         self,
+         entity: Document,
+         docling_document: "DoclingDocument | None",
+         chunks: list["Chunk"] | None = None,
+     ) -> Document:
+         """Create a document with its chunks and embeddings."""
+         # Snapshot table versions for versioned rollback (if supported)
+         versions = self.store.current_table_versions()
+
+         # Create the document
+         created_doc = await self.create(entity)
+
+         # Attempt to create chunks; on failure, prefer version rollback
+         try:
+             # Create chunks if not provided
+             if chunks is None:
+                 assert docling_document is not None, (
+                     "docling_document is required when chunks are not provided"
+                 )
+                 assert created_doc.id is not None, (
+                     "Document ID should not be None after creation"
+                 )
+                 await self.chunk_repository.create_chunks_for_document(
+                     created_doc.id, docling_document
+                 )
+             else:
+                 # Use provided chunks, set order from list position
+                 assert created_doc.id is not None, (
+                     "Document ID should not be None after creation"
+                 )
+                 for order, chunk in enumerate(chunks):
+                     chunk.document_id = created_doc.id
+                     chunk.order = order
+                     await self.chunk_repository.create(chunk)
+
+             # Vacuum old versions in background (non-blocking)
+             asyncio.create_task(self.store.vacuum())
+
+             return created_doc
+         except Exception:
+             # Roll back to the captured versions and re-raise
+             self.store.restore_table_versions(versions)
+             raise
+
+     async def _update_and_rechunk(
+         self, entity: Document, docling_document: "DoclingDocument"
+     ) -> Document:
+         """Update a document and regenerate its chunks."""
+         assert entity.id is not None, "Document ID is required for update"
+
+         # Snapshot table versions for versioned rollback
+         versions = self.store.current_table_versions()
+
+         # Delete existing chunks before writing new ones
+         await self.chunk_repository.delete_by_document_id(entity.id)
+
+         try:
+             # Update the document
+             updated_doc = await self.update(entity)
+
+             # Create new chunks
+             assert updated_doc.id is not None, (
+                 "Document ID should not be None after update"
+             )
+             await self.chunk_repository.create_chunks_for_document(
+                 updated_doc.id, docling_document
+             )
+
+             # Vacuum old versions in background (non-blocking)
+             asyncio.create_task(self.store.vacuum())
+
+             return updated_doc
+         except Exception:
+             # Roll back to the captured versions and re-raise
+             self.store.restore_table_versions(versions)
+             raise
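
The repository above pairs document writes with chunk (re)generation: _create_and_chunk and _update_and_rechunk snapshot LanceDB table versions, roll back on failure, and vacuum old versions in the background. A minimal sketch of the public CRUD surface follows, under the same placeholder assumption about the Store constructor as before; the Document model is assumed to allow id and timestamps to default, since create() assigns them after insertion.

import asyncio

from haiku.rag.store.engine import Store
from haiku.rag.store.models.document import Document
from haiku.rag.store.repositories.document import DocumentRepository


async def main() -> None:
    # Placeholder: the Store constructor signature is not part of this diff.
    store = Store("data/haiku-rag.lancedb")
    docs = DocumentRepository(store)

    # create() generates the id and the created_at/updated_at timestamps.
    doc = await docs.create(
        Document(
            content="LanceDB stores both documents and chunk embeddings.",
            uri="file:///notes/lancedb.md",
            title="LanceDB notes",
            metadata={"tags": ["notes"]},
        )
    )

    # Filtering uses plain SQL WHERE clauses, as in list_all() above.
    listed = await docs.list_all(limit=10, filter="title = 'LanceDB notes'")
    fetched = await docs.get_by_uri("file:///notes/lancedb.md")

    assert doc.id is not None
    deleted = await docs.delete(doc.id)
    print(len(listed), fetched is not None, deleted)


asyncio.run(main())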