haiku.rag 0.10.2__py3-none-any.whl → 0.19.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. README.md +172 -0
  2. {haiku_rag-0.10.2.dist-info → haiku_rag-0.19.3.dist-info}/METADATA +79 -51
  3. haiku_rag-0.19.3.dist-info/RECORD +6 -0
  4. {haiku_rag-0.10.2.dist-info → haiku_rag-0.19.3.dist-info}/WHEEL +1 -1
  5. haiku/rag/__init__.py +0 -0
  6. haiku/rag/app.py +0 -437
  7. haiku/rag/chunker.py +0 -51
  8. haiku/rag/cli.py +0 -466
  9. haiku/rag/client.py +0 -605
  10. haiku/rag/config.py +0 -81
  11. haiku/rag/embeddings/__init__.py +0 -35
  12. haiku/rag/embeddings/base.py +0 -15
  13. haiku/rag/embeddings/ollama.py +0 -17
  14. haiku/rag/embeddings/openai.py +0 -16
  15. haiku/rag/embeddings/vllm.py +0 -19
  16. haiku/rag/embeddings/voyageai.py +0 -17
  17. haiku/rag/logging.py +0 -56
  18. haiku/rag/mcp.py +0 -156
  19. haiku/rag/migration.py +0 -316
  20. haiku/rag/monitor.py +0 -73
  21. haiku/rag/qa/__init__.py +0 -15
  22. haiku/rag/qa/agent.py +0 -91
  23. haiku/rag/qa/prompts.py +0 -60
  24. haiku/rag/reader.py +0 -115
  25. haiku/rag/reranking/__init__.py +0 -34
  26. haiku/rag/reranking/base.py +0 -13
  27. haiku/rag/reranking/cohere.py +0 -34
  28. haiku/rag/reranking/mxbai.py +0 -28
  29. haiku/rag/reranking/vllm.py +0 -44
  30. haiku/rag/research/__init__.py +0 -20
  31. haiku/rag/research/common.py +0 -53
  32. haiku/rag/research/dependencies.py +0 -47
  33. haiku/rag/research/graph.py +0 -29
  34. haiku/rag/research/models.py +0 -70
  35. haiku/rag/research/nodes/evaluate.py +0 -80
  36. haiku/rag/research/nodes/plan.py +0 -63
  37. haiku/rag/research/nodes/search.py +0 -93
  38. haiku/rag/research/nodes/synthesize.py +0 -51
  39. haiku/rag/research/prompts.py +0 -114
  40. haiku/rag/research/state.py +0 -25
  41. haiku/rag/store/__init__.py +0 -4
  42. haiku/rag/store/engine.py +0 -269
  43. haiku/rag/store/models/__init__.py +0 -4
  44. haiku/rag/store/models/chunk.py +0 -17
  45. haiku/rag/store/models/document.py +0 -17
  46. haiku/rag/store/repositories/__init__.py +0 -9
  47. haiku/rag/store/repositories/chunk.py +0 -424
  48. haiku/rag/store/repositories/document.py +0 -237
  49. haiku/rag/store/repositories/settings.py +0 -155
  50. haiku/rag/store/upgrades/__init__.py +0 -62
  51. haiku/rag/store/upgrades/v0_10_1.py +0 -64
  52. haiku/rag/store/upgrades/v0_9_3.py +0 -112
  53. haiku/rag/utils.py +0 -199
  54. haiku_rag-0.10.2.dist-info/RECORD +0 -54
  55. {haiku_rag-0.10.2.dist-info → haiku_rag-0.19.3.dist-info}/entry_points.txt +0 -0
  56. {haiku_rag-0.10.2.dist-info → haiku_rag-0.19.3.dist-info}/licenses/LICENSE +0 -0
haiku/rag/store/repositories/chunk.py
@@ -1,424 +0,0 @@
- import asyncio
- import inspect
- import json
- import logging
- from uuid import uuid4
-
- from docling_core.types.doc.document import DoclingDocument
- from lancedb.rerankers import RRFReranker
-
- from haiku.rag.chunker import chunker
- from haiku.rag.config import Config
- from haiku.rag.embeddings import get_embedder
- from haiku.rag.store.engine import DocumentRecord, Store
- from haiku.rag.store.models.chunk import Chunk
- from haiku.rag.utils import load_callable, text_to_docling_document
-
- logger = logging.getLogger(__name__)
-
-
- class ChunkRepository:
-     """Repository for Chunk operations."""
-
-     def __init__(self, store: Store) -> None:
-         self.store = store
-         self.embedder = get_embedder()
-         self._optimize_lock = asyncio.Lock()
-
-     def _ensure_fts_index(self) -> None:
-         """Ensure FTS index exists on the content column."""
-         try:
-             self.store.chunks_table.create_fts_index(
-                 "content", replace=True, with_position=True, remove_stop_words=False
-             )
-         except Exception as e:
-             # Log the error but don't fail - FTS might already exist
-             logger.debug(f"FTS index creation skipped: {e}")
-
-     async def _optimize(self) -> None:
-         """Optimize the chunks table to refresh indexes."""
-         # Skip optimization for LanceDB Cloud as it handles this automatically
-         if Config.LANCEDB_URI and Config.LANCEDB_URI.startswith("db://"):
-             return
-
-         async with self._optimize_lock:
-             try:
-                 self.store.chunks_table.optimize()
-             except (RuntimeError, OSError) as e:
-                 # Handle "too many open files" and other resource errors gracefully
-                 logger.debug(
-                     f"Table optimization skipped due to resource constraints: {e}"
-                 )
-
-     async def create(self, entity: Chunk) -> Chunk:
-         """Create a chunk in the database."""
-         assert entity.document_id, "Chunk must have a document_id to be created"
-
-         chunk_id = str(uuid4())
-
-         # Generate embedding if not provided
-         if entity.embedding is not None:
-             embedding = entity.embedding
-         else:
-             embedding = await self.embedder.embed(entity.content)
-         order_val = int(entity.order)
-
-         chunk_record = self.store.ChunkRecord(
-             id=chunk_id,
-             document_id=entity.document_id,
-             content=entity.content,
-             metadata=json.dumps(
-                 {k: v for k, v in entity.metadata.items() if k != "order"}
-             ),
-             order=order_val,
-             vector=embedding,
-         )
-
-         self.store.chunks_table.add([chunk_record])
-
-         entity.id = chunk_id
-
-         # Try to optimize if not currently locked (non-blocking)
-         if not self._optimize_lock.locked():
-             asyncio.create_task(self._optimize())
-
-         return entity
-
-     async def get_by_id(self, entity_id: str) -> Chunk | None:
-         """Get a chunk by its ID."""
-         results = list(
-             self.store.chunks_table.search()
-             .where(f"id = '{entity_id}'")
-             .limit(1)
-             .to_pydantic(self.store.ChunkRecord)
-         )
-
-         if not results:
-             return None
-
-         chunk_record = results[0]
-         md = json.loads(chunk_record.metadata)
-         return Chunk(
-             id=chunk_record.id,
-             document_id=chunk_record.document_id,
-             content=chunk_record.content,
-             metadata=md,
-             order=chunk_record.order,
-         )
-
-     async def update(self, entity: Chunk) -> Chunk:
-         """Update an existing chunk."""
-         assert entity.id, "Chunk ID is required for update"
-
-         embedding = await self.embedder.embed(entity.content)
-         order_val = int(entity.order)
-
-         self.store.chunks_table.update(
-             where=f"id = '{entity.id}'",
-             values={
-                 "document_id": entity.document_id,
-                 "content": entity.content,
-                 "metadata": json.dumps(
-                     {k: v for k, v in entity.metadata.items() if k != "order"}
-                 ),
-                 "order": order_val,
-                 "vector": embedding,
-             },
-         )
-         # Try to optimize if not currently locked (non-blocking)
-         if not self._optimize_lock.locked():
-             asyncio.create_task(self._optimize())
-
-         return entity
-
-     async def delete(self, entity_id: str) -> bool:
-         """Delete a chunk by its ID."""
-         chunk = await self.get_by_id(entity_id)
-         if chunk is None:
-             return False
-
-         self.store.chunks_table.delete(f"id = '{entity_id}'")
-         return True
-
-     async def list_all(
-         self, limit: int | None = None, offset: int | None = None
-     ) -> list[Chunk]:
-         """List all chunks with optional pagination."""
-         query = self.store.chunks_table.search()
-
-         if offset is not None:
-             query = query.offset(offset)
-         if limit is not None:
-             query = query.limit(limit)
-
-         results = list(query.to_pydantic(self.store.ChunkRecord))
-
-         chunks: list[Chunk] = []
-         for rec in results:
-             md = json.loads(rec.metadata)
-             chunks.append(
-                 Chunk(
-                     id=rec.id,
-                     document_id=rec.document_id,
-                     content=rec.content,
-                     metadata=md,
-                     order=rec.order,
-                 )
-             )
-         return chunks
-
-     async def create_chunks_for_document(
-         self, document_id: str, document: DoclingDocument
-     ) -> list[Chunk]:
-         """Create chunks and embeddings for a document from DoclingDocument."""
-         # Optionally preprocess markdown before chunking
-         processed_document = document
-         preprocessor_path = Config.MARKDOWN_PREPROCESSOR
-         if preprocessor_path:
-             try:
-                 pre_fn = load_callable(preprocessor_path)
-                 markdown = document.export_to_markdown()
-                 result = pre_fn(markdown)
-                 if inspect.isawaitable(result):
-                     result = await result  # type: ignore[assignment]
-                 processed_markdown = result
-                 if not isinstance(processed_markdown, str):
-                     raise ValueError("Preprocessor must return a markdown string")
-                 processed_document = text_to_docling_document(
-                     processed_markdown, name="content.md"
-                 )
-             except Exception as e:
-                 logger.error(
-                     f"Failed to apply MARKDOWN_PREPROCESSOR '{preprocessor_path}': {e}. Proceeding without preprocessing."
-                 )
-                 raise e
-
-         chunk_texts = await chunker.chunk(processed_document)
-
-         embeddings = await self.embedder.embed(chunk_texts)
-
-         # Prepare all chunk records for batch insertion
-         chunk_records = []
-         created_chunks = []
-
-         for order, (chunk_text, embedding) in enumerate(zip(chunk_texts, embeddings)):
-             chunk_id = str(uuid4())
-
-             chunk_record = self.store.ChunkRecord(
-                 id=chunk_id,
-                 document_id=document_id,
-                 content=chunk_text,
-                 metadata=json.dumps({}),
-                 order=order,
-                 vector=embedding,
-             )
-             chunk_records.append(chunk_record)
-
-             chunk = Chunk(
-                 id=chunk_id,
-                 document_id=document_id,
-                 content=chunk_text,
-                 metadata={},
-                 order=order,
-             )
-             created_chunks.append(chunk)
-
-         # Batch insert all chunks at once
-         if chunk_records:
-             self.store.chunks_table.add(chunk_records)
-
-         # Force optimization once at the end for bulk operations
-         await self._optimize()
-         return created_chunks
-
-     async def delete_all(self) -> None:
-         """Delete all chunks from the database."""
-         # Drop and recreate table to clear all data
-         self.store.db.drop_table("chunks")
-         self.store.chunks_table = self.store.db.create_table(
-             "chunks", schema=self.store.ChunkRecord
-         )
-         # Create FTS index on the new table with phrase query support
-         self.store.chunks_table.create_fts_index(
-             "content", replace=True, with_position=True, remove_stop_words=False
-         )
-
-     async def delete_by_document_id(self, document_id: str) -> bool:
-         """Delete all chunks for a document."""
-         chunks = await self.get_by_document_id(document_id)
-
-         if not chunks:
-             return False
-
-         self.store.chunks_table.delete(f"document_id = '{document_id}'")
-         return True
-
-     async def search(
-         self, query: str, limit: int = 5, search_type: str = "hybrid"
-     ) -> list[tuple[Chunk, float]]:
-         """Search for relevant chunks using the specified search method.
-
-         Args:
-             query: The search query string.
-             limit: Maximum number of results to return.
-             search_type: Type of search - "vector", "fts", or "hybrid" (default).
-
-         Returns:
-             List of (chunk, score) tuples ordered by relevance.
-         """
-         if not query.strip():
-             return []
-
-         if search_type == "vector":
-             query_embedding = await self.embedder.embed(query)
-
-             results = self.store.chunks_table.search(
-                 query_embedding, query_type="vector", vector_column_name="vector"
-             ).limit(limit)
-
-             return await self._process_search_results(results)
-
-         elif search_type == "fts":
-             results = self.store.chunks_table.search(query, query_type="fts").limit(
-                 limit
-             )
-             return await self._process_search_results(results)
-
-         else:  # hybrid (default)
-             query_embedding = await self.embedder.embed(query)
-
-             # Create RRF reranker
-             reranker = RRFReranker()
-
-             # Perform native hybrid search with RRF reranking
-             results = (
-                 self.store.chunks_table.search(query_type="hybrid")
-                 .vector(query_embedding)
-                 .text(query)
-                 .rerank(reranker)
-                 .limit(limit)
-             )
-             return await self._process_search_results(results)
-
-     async def get_by_document_id(self, document_id: str) -> list[Chunk]:
-         """Get all chunks for a specific document."""
-         results = list(
-             self.store.chunks_table.search()
-             .where(f"document_id = '{document_id}'")
-             .to_pydantic(self.store.ChunkRecord)
-         )
-
-         # Get document info
-         doc_results = list(
-             self.store.documents_table.search()
-             .where(f"id = '{document_id}'")
-             .limit(1)
-             .to_pydantic(DocumentRecord)
-         )
-
-         doc_uri = doc_results[0].uri if doc_results else None
-         doc_title = doc_results[0].title if doc_results else None
-         doc_meta = doc_results[0].metadata if doc_results else "{}"
-
-         chunks: list[Chunk] = []
-         for rec in results:
-             md = json.loads(rec.metadata)
-             chunks.append(
-                 Chunk(
-                     id=rec.id,
-                     document_id=rec.document_id,
-                     content=rec.content,
-                     metadata=md,
-                     order=rec.order,
-                     document_uri=doc_uri,
-                     document_title=doc_title,
-                     document_meta=json.loads(doc_meta),
-                 )
-             )
-
-         chunks.sort(key=lambda c: c.order)
-         return chunks
-
-     async def get_adjacent_chunks(self, chunk: Chunk, num_adjacent: int) -> list[Chunk]:
-         """Get adjacent chunks before and after the given chunk within the same document."""
-         assert chunk.document_id, "Document id is required for adjacent chunk finding"
-
-         chunk_order = chunk.order
-
-         # Fetch chunks for the same document and filter by order proximity
-         all_chunks = await self.get_by_document_id(chunk.document_id)
-
-         adjacent_chunks: list[Chunk] = []
-         for c in all_chunks:
-             c_order = c.order
-             if c.id != chunk.id and abs(c_order - chunk_order) <= num_adjacent:
-                 adjacent_chunks.append(c)
-
-         return adjacent_chunks
-
-     async def _process_search_results(self, query_result) -> list[tuple[Chunk, float]]:
-         """Process search results into chunks with document info and scores."""
-         chunks_with_scores = []
-
-         # Get both arrow and pydantic results to access scores
-         arrow_result = query_result.to_arrow()
-         pydantic_results = list(query_result.to_pydantic(self.store.ChunkRecord))
-
-         # Extract scores from arrow result based on search type
-         scores = []
-         column_names = arrow_result.column_names
-
-         if "_distance" in column_names:
-             # Vector search - distance (lower is better, convert to similarity)
-             distances = arrow_result.column("_distance").to_pylist()
-             scores = [max(0.0, 1.0 / (1.0 + dist)) for dist in distances]
-         elif "_relevance_score" in column_names:
-             # Hybrid search - relevance score (higher is better)
-             scores = arrow_result.column("_relevance_score").to_pylist()
-         elif "_score" in column_names:
-             # FTS search - score (higher is better)
-             scores = arrow_result.column("_score").to_pylist()
-         else:
-             raise ValueError("Unknown search result format, cannot extract scores")
-
-         # Collect all unique document IDs for batch lookup
-         document_ids = list(set(chunk.document_id for chunk in pydantic_results))
-
-         # Batch fetch all documents at once
-         documents_map = {}
-         if document_ids:
-             # Create a WHERE clause for all document IDs
-             where_clause = " OR ".join(f"id = '{doc_id}'" for doc_id in document_ids)
-             doc_results = list(
-                 self.store.documents_table.search()
-                 .where(where_clause)
-                 .to_pydantic(DocumentRecord)
-             )
-             documents_map = {doc.id: doc for doc in doc_results}
-
-         for i, chunk_record in enumerate(pydantic_results):
-             # Get document info from pre-fetched map
-             doc = documents_map.get(chunk_record.document_id)
-             doc_uri = doc.uri if doc else None
-             doc_title = doc.title if doc else None
-             doc_meta = doc.metadata if doc else "{}"
-
-             md = json.loads(chunk_record.metadata)
-
-             chunk = Chunk(
-                 id=chunk_record.id,
-                 document_id=chunk_record.document_id,
-                 content=chunk_record.content,
-                 metadata=md,
-                 order=chunk_record.order,
-                 document_uri=doc_uri,
-                 document_title=doc_title,
-                 document_meta=json.loads(doc_meta),
-             )
-
-             # Get score from arrow result
-             score = scores[i] if i < len(scores) else 1.0
-
-             chunks_with_scores.append((chunk, score))
-
-         return chunks_with_scores
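For context, a minimal sketch of how the removed `ChunkRepository` could be driven under haiku.rag 0.10.2, using only the signatures shown in the hunk above (`ChunkRepository(store)`, `search()`); how a `Store` is constructed is not part of this diff, so the helper takes an already-opened store as a parameter. The helper name `top_chunks` is illustrative, not part of the package.

```python
# Illustrative sketch against haiku.rag 0.10.2 only; these modules are removed in 0.19.3.
from haiku.rag.store.engine import Store
from haiku.rag.store.repositories.chunk import ChunkRepository


async def top_chunks(store: Store, query: str, limit: int = 5):
    """Run the hybrid (vector + FTS, RRF-reranked) search shown above."""
    repo = ChunkRepository(store)
    # search_type may also be "vector" or "fts", per the removed search() docstring.
    results = await repo.search(query, limit=limit, search_type="hybrid")
    return [(chunk.document_uri, score, chunk.content) for chunk, score in results]
```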
haiku/rag/store/repositories/document.py
@@ -1,237 +0,0 @@
- import json
- from datetime import datetime
- from typing import TYPE_CHECKING
- from uuid import uuid4
-
- from docling_core.types.doc.document import DoclingDocument
-
- from haiku.rag.store.engine import DocumentRecord, Store
- from haiku.rag.store.models.document import Document
-
- if TYPE_CHECKING:
-     from haiku.rag.store.models.chunk import Chunk
-
-
- class DocumentRepository:
-     """Repository for Document operations."""
-
-     def __init__(self, store: Store) -> None:
-         self.store = store
-         self._chunk_repository = None
-
-     @property
-     def chunk_repository(self):
-         """Lazy-load ChunkRepository when needed."""
-         if self._chunk_repository is None:
-             from haiku.rag.store.repositories.chunk import ChunkRepository
-
-             self._chunk_repository = ChunkRepository(self.store)
-         return self._chunk_repository
-
-     def _record_to_document(self, record: DocumentRecord) -> Document:
-         """Convert a DocumentRecord to a Document model."""
-         return Document(
-             id=record.id,
-             content=record.content,
-             uri=record.uri,
-             title=record.title,
-             metadata=json.loads(record.metadata),
-             created_at=datetime.fromisoformat(record.created_at)
-             if record.created_at
-             else datetime.now(),
-             updated_at=datetime.fromisoformat(record.updated_at)
-             if record.updated_at
-             else datetime.now(),
-         )
-
-     async def create(self, entity: Document) -> Document:
-         """Create a document in the database."""
-         # Generate new UUID
-         doc_id = str(uuid4())
-
-         # Create timestamp
-         now = datetime.now().isoformat()
-
-         # Create document record
-         doc_record = DocumentRecord(
-             id=doc_id,
-             content=entity.content,
-             uri=entity.uri,
-             title=entity.title,
-             metadata=json.dumps(entity.metadata),
-             created_at=now,
-             updated_at=now,
-         )
-
-         # Add to table
-         self.store.documents_table.add([doc_record])
-
-         entity.id = doc_id
-         entity.created_at = datetime.fromisoformat(now)
-         entity.updated_at = datetime.fromisoformat(now)
-         return entity
-
-     async def get_by_id(self, entity_id: str) -> Document | None:
-         """Get a document by its ID."""
-         results = list(
-             self.store.documents_table.search()
-             .where(f"id = '{entity_id}'")
-             .limit(1)
-             .to_pydantic(DocumentRecord)
-         )
-
-         if not results:
-             return None
-
-         return self._record_to_document(results[0])
-
-     async def update(self, entity: Document) -> Document:
-         """Update an existing document."""
-         assert entity.id, "Document ID is required for update"
-
-         # Update timestamp
-         now = datetime.now().isoformat()
-         entity.updated_at = datetime.fromisoformat(now)
-
-         # Update the record
-         self.store.documents_table.update(
-             where=f"id = '{entity.id}'",
-             values={
-                 "content": entity.content,
-                 "uri": entity.uri,
-                 "title": entity.title,
-                 "metadata": json.dumps(entity.metadata),
-                 "updated_at": now,
-             },
-         )
-
-         return entity
-
-     async def delete(self, entity_id: str) -> bool:
-         """Delete a document by its ID."""
-         # Check if document exists
-         doc = await self.get_by_id(entity_id)
-         if doc is None:
-             return False
-
-         # Delete associated chunks first
-         await self.chunk_repository.delete_by_document_id(entity_id)
-
-         # Delete the document
-         self.store.documents_table.delete(f"id = '{entity_id}'")
-         return True
-
-     async def list_all(
-         self, limit: int | None = None, offset: int | None = None
-     ) -> list[Document]:
-         """List all documents with optional pagination."""
-         query = self.store.documents_table.search()
-
-         if offset is not None:
-             query = query.offset(offset)
-         if limit is not None:
-             query = query.limit(limit)
-
-         results = list(query.to_pydantic(DocumentRecord))
-         return [self._record_to_document(doc) for doc in results]
-
-     async def get_by_uri(self, uri: str) -> Document | None:
-         """Get a document by its URI."""
-         results = list(
-             self.store.documents_table.search()
-             .where(f"uri = '{uri}'")
-             .limit(1)
-             .to_pydantic(DocumentRecord)
-         )
-
-         if not results:
-             return None
-
-         return self._record_to_document(results[0])
-
-     async def delete_all(self) -> None:
-         """Delete all documents from the database."""
-         # Delete all chunks first
-         await self.chunk_repository.delete_all()
-
-         # Get count before deletion
-         count = len(
-             list(
-                 self.store.documents_table.search().limit(1).to_pydantic(DocumentRecord)
-             )
-         )
-         if count > 0:
-             # Drop and recreate table to clear all data
-             self.store.db.drop_table("documents")
-             self.store.documents_table = self.store.db.create_table(
-                 "documents", schema=DocumentRecord
-             )
-
-     async def _create_with_docling(
-         self,
-         entity: Document,
-         docling_document: DoclingDocument,
-         chunks: list["Chunk"] | None = None,
-     ) -> Document:
-         """Create a document with its chunks and embeddings."""
-         # Snapshot table versions for versioned rollback (if supported)
-         versions = self.store.current_table_versions()
-
-         # Create the document
-         created_doc = await self.create(entity)
-
-         # Attempt to create chunks; on failure, prefer version rollback
-         try:
-             # Create chunks if not provided
-             if chunks is None:
-                 assert created_doc.id is not None, (
-                     "Document ID should not be None after creation"
-                 )
-                 await self.chunk_repository.create_chunks_for_document(
-                     created_doc.id, docling_document
-                 )
-             else:
-                 # Use provided chunks, set order from list position
-                 assert created_doc.id is not None, (
-                     "Document ID should not be None after creation"
-                 )
-                 for order, chunk in enumerate(chunks):
-                     chunk.document_id = created_doc.id
-                     chunk.order = order
-                     await self.chunk_repository.create(chunk)
-
-             return created_doc
-         except Exception:
-             # Roll back to the captured versions and re-raise
-             self.store.restore_table_versions(versions)
-             raise
-
-     async def _update_with_docling(
-         self, entity: Document, docling_document: DoclingDocument
-     ) -> Document:
-         """Update a document and regenerate its chunks."""
-         assert entity.id is not None, "Document ID is required for update"
-
-         # Snapshot table versions for versioned rollback
-         versions = self.store.current_table_versions()
-
-         # Delete existing chunks before writing new ones
-         await self.chunk_repository.delete_by_document_id(entity.id)
-
-         try:
-             # Update the document
-             updated_doc = await self.update(entity)
-
-             # Create new chunks
-             assert updated_doc.id is not None, (
-                 "Document ID should not be None after update"
-             )
-             await self.chunk_repository.create_chunks_for_document(
-                 updated_doc.id, docling_document
-             )
-
-             return updated_doc
-         except Exception:
-             # Roll back to the captured versions and re-raise
-             self.store.restore_table_versions(versions)
-             raise
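Similarly, a hedged sketch of the removed `DocumentRepository`'s create/lookup flow, restricted to the methods shown above (`get_by_uri()`, `create()`, `update()`). The `Document` field names mirror `_record_to_document()`; the assumption that `Document` is keyword-constructible with defaults for `id`, `title`, and the timestamps is mine, and `upsert_document` is an illustrative helper, not a package API.

```python
# Illustrative sketch against haiku.rag 0.10.2 only; this module is removed in 0.19.3.
from haiku.rag.store.engine import Store
from haiku.rag.store.models.document import Document
from haiku.rag.store.repositories.document import DocumentRepository


async def upsert_document(store: Store, uri: str, content: str) -> Document:
    """Create the document if its URI is unknown, otherwise update it in place."""
    repo = DocumentRepository(store)
    existing = await repo.get_by_uri(uri)
    if existing is None:
        # Field names taken from _record_to_document() above; defaults are assumed.
        return await repo.create(Document(content=content, uri=uri, metadata={}))
    existing.content = content
    return await repo.update(existing)
```

Note that plain `update()` only rewrites the documents table; regenerating chunks is handled by the private `_update_with_docling()` path shown above.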