haiku.rag 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of haiku.rag might be problematic. Click here for more details.

@@ -1,248 +1,214 @@
1
1
  import json
2
+ from datetime import datetime
2
3
  from typing import TYPE_CHECKING
4
+ from uuid import uuid4
3
5
 
4
6
  from docling_core.types.doc.document import DoclingDocument
5
7
 
8
+ from haiku.rag.store.engine import DocumentRecord, Store
6
9
  from haiku.rag.store.models.document import Document
7
- from haiku.rag.store.repositories.base import BaseRepository
8
- from haiku.rag.utils import text_to_docling_document
9
10
 
10
11
  if TYPE_CHECKING:
11
12
  from haiku.rag.store.models.chunk import Chunk
12
13
 
13
14
 
14
- class DocumentRepository(BaseRepository[Document]):
15
- """Repository for Document database operations."""
15
+ class DocumentRepository:
16
+ """Repository for Document operations."""
16
17
 
17
- def __init__(self, store, chunk_repository=None):
18
- super().__init__(store)
19
- # Avoid circular import by using late import if not provided
20
- if chunk_repository is None:
18
+ def __init__(self, store: Store) -> None:
19
+ self.store = store
20
+ self._chunk_repository = None
21
+
22
+ @property
23
+ def chunk_repository(self):
24
+ """Lazy-load ChunkRepository when needed."""
25
+ if self._chunk_repository is None:
21
26
  from haiku.rag.store.repositories.chunk import ChunkRepository
22
27
 
23
- chunk_repository = ChunkRepository(store)
24
- self.chunk_repository = chunk_repository
28
+ self._chunk_repository = ChunkRepository(self.store)
29
+ return self._chunk_repository
25
30
 
26
- async def _create_with_docling(
27
- self,
28
- entity: Document,
29
- docling_document: DoclingDocument,
30
- chunks: list["Chunk"] | None = None,
31
- ) -> Document:
32
- """Create a document with its chunks and embeddings."""
33
- if self.store._connection is None:
34
- raise ValueError("Store connection is not available")
35
-
36
- cursor = self.store._connection.cursor()
37
-
38
- # Start transaction
39
- cursor.execute("BEGIN TRANSACTION")
40
-
41
- try:
42
- # Insert the document
43
- cursor.execute(
44
- """
45
- INSERT INTO documents (content, uri, metadata, created_at, updated_at)
46
- VALUES (:content, :uri, :metadata, :created_at, :updated_at)
47
- """,
48
- {
49
- "content": entity.content,
50
- "uri": entity.uri,
51
- "metadata": json.dumps(entity.metadata),
52
- "created_at": entity.created_at,
53
- "updated_at": entity.updated_at,
54
- },
55
- )
56
-
57
- document_id = cursor.lastrowid
58
- assert document_id is not None, "Failed to create document in database"
59
- entity.id = document_id
60
-
61
- # Create chunks - either use provided chunks or generate from content
62
- if chunks is not None:
63
- # Use provided chunks, but update their document_id and set order from list position
64
- for order, chunk in enumerate(chunks):
65
- chunk.document_id = document_id
66
- # Ensure order is set from list position
67
- chunk.metadata = chunk.metadata.copy() if chunk.metadata else {}
68
- chunk.metadata["order"] = order
69
- await self.chunk_repository.create(chunk, commit=False)
70
- else:
71
- # Create chunks and embeddings using DoclingDocument
72
- await self.chunk_repository.create_chunks_for_document(
73
- document_id, docling_document, commit=False
74
- )
75
-
76
- cursor.execute("COMMIT")
77
- return entity
78
-
79
- except Exception:
80
- cursor.execute("ROLLBACK")
81
- raise
31
+ def _record_to_document(self, record: DocumentRecord) -> Document:
32
+ """Convert a DocumentRecord to a Document model."""
33
+ return Document(
34
+ id=record.id,
35
+ content=record.content,
36
+ uri=record.uri,
37
+ metadata=json.loads(record.metadata) if record.metadata else {},
38
+ created_at=datetime.fromisoformat(record.created_at)
39
+ if record.created_at
40
+ else datetime.now(),
41
+ updated_at=datetime.fromisoformat(record.updated_at)
42
+ if record.updated_at
43
+ else datetime.now(),
44
+ )
82
45
 
83
46
  async def create(self, entity: Document) -> Document:
84
- """Create a document with its chunks and embeddings."""
85
- # Convert content to DoclingDocument
86
- docling_document = text_to_docling_document(entity.content)
47
+ """Create a document in the database."""
48
+ # Generate new UUID
49
+ doc_id = str(uuid4())
50
+
51
+ # Create timestamp
52
+ now = datetime.now().isoformat()
53
+
54
+ # Create document record
55
+ doc_record = DocumentRecord(
56
+ id=doc_id,
57
+ content=entity.content,
58
+ uri=entity.uri,
59
+ metadata=json.dumps(entity.metadata),
60
+ created_at=now,
61
+ updated_at=now,
62
+ )
63
+
64
+ # Add to table
65
+ self.store.documents_table.add([doc_record])
87
66
 
88
- return await self._create_with_docling(entity, docling_document)
67
+ entity.id = doc_id
68
+ entity.created_at = datetime.fromisoformat(now)
69
+ entity.updated_at = datetime.fromisoformat(now)
70
+ return entity
89
71
 
90
- async def get_by_id(self, entity_id: int) -> Document | None:
72
+ async def get_by_id(self, entity_id: str) -> Document | None:
91
73
  """Get a document by its ID."""
92
- if self.store._connection is None:
93
- raise ValueError("Store connection is not available")
94
-
95
- cursor = self.store._connection.cursor()
96
- cursor.execute(
97
- """
98
- SELECT id, content, uri, metadata, created_at, updated_at
99
- FROM documents WHERE id = :id
100
- """,
101
- {"id": entity_id},
74
+ results = list(
75
+ self.store.documents_table.search()
76
+ .where(f"id = '{entity_id}'")
77
+ .limit(1)
78
+ .to_pydantic(DocumentRecord)
102
79
  )
103
80
 
104
- row = cursor.fetchone()
105
- if row is None:
81
+ if not results:
106
82
  return None
107
83
 
108
- document_id, content, uri, metadata_json, created_at, updated_at = row
109
- metadata = json.loads(metadata_json) if metadata_json else {}
110
-
111
- return Document(
112
- id=document_id,
113
- content=content,
114
- uri=uri,
115
- metadata=metadata,
116
- created_at=created_at,
117
- updated_at=updated_at,
118
- )
84
+ return self._record_to_document(results[0])
119
85
 
120
- async def get_by_uri(self, uri: str) -> Document | None:
121
- """Get a document by its URI."""
122
- if self.store._connection is None:
123
- raise ValueError("Store connection is not available")
124
-
125
- cursor = self.store._connection.cursor()
126
- cursor.execute(
127
- """
128
- SELECT id, content, uri, metadata, created_at, updated_at
129
- FROM documents WHERE uri = :uri
130
- """,
131
- {"uri": uri},
86
+ async def update(self, entity: Document) -> Document:
87
+ """Update an existing document."""
88
+ assert entity.id, "Document ID is required for update"
89
+
90
+ # Update timestamp
91
+ now = datetime.now().isoformat()
92
+ entity.updated_at = datetime.fromisoformat(now)
93
+
94
+ # Update the record
95
+ self.store.documents_table.update(
96
+ where=f"id = '{entity.id}'",
97
+ values={
98
+ "content": entity.content,
99
+ "uri": entity.uri,
100
+ "metadata": json.dumps(entity.metadata),
101
+ "updated_at": now,
102
+ },
132
103
  )
133
104
 
134
- row = cursor.fetchone()
135
- if row is None:
136
- return None
105
+ return entity
137
106
 
138
- document_id, content, uri, metadata_json, created_at, updated_at = row
139
- metadata = json.loads(metadata_json) if metadata_json else {}
107
+ async def delete(self, entity_id: str) -> bool:
108
+ """Delete a document by its ID."""
109
+ # Check if document exists
110
+ doc = await self.get_by_id(entity_id)
111
+ if doc is None:
112
+ return False
140
113
 
141
- return Document(
142
- id=document_id,
143
- content=content,
144
- uri=uri,
145
- metadata=metadata,
146
- created_at=created_at,
147
- updated_at=updated_at,
148
- )
114
+ # Delete associated chunks first
115
+ await self.chunk_repository.delete_by_document_id(entity_id)
149
116
 
150
- async def _update_with_docling(
151
- self, entity: Document, docling_document: DoclingDocument
152
- ) -> Document:
153
- """Update an existing document and regenerate its chunks and embeddings."""
154
- if self.store._connection is None:
155
- raise ValueError("Store connection is not available")
156
- if entity.id is None:
157
- raise ValueError("Document ID is required for update")
158
-
159
- cursor = self.store._connection.cursor()
160
-
161
- # Start transaction
162
- cursor.execute("BEGIN TRANSACTION")
163
-
164
- try:
165
- # Update the document
166
- cursor.execute(
167
- """
168
- UPDATE documents
169
- SET content = :content, uri = :uri, metadata = :metadata, updated_at = :updated_at
170
- WHERE id = :id
171
- """,
172
- {
173
- "content": entity.content,
174
- "uri": entity.uri,
175
- "metadata": json.dumps(entity.metadata),
176
- "updated_at": entity.updated_at,
177
- "id": entity.id,
178
- },
179
- )
117
+ # Delete the document
118
+ self.store.documents_table.delete(f"id = '{entity_id}'")
119
+ return True
180
120
 
181
- # Delete existing chunks and regenerate using DoclingDocument
182
- await self.chunk_repository.delete_by_document_id(entity.id, commit=False)
183
- await self.chunk_repository.create_chunks_for_document(
184
- entity.id, docling_document, commit=False
185
- )
121
+ async def list_all(
122
+ self, limit: int | None = None, offset: int | None = None
123
+ ) -> list[Document]:
124
+ """List all documents with optional pagination."""
125
+ query = self.store.documents_table.search()
186
126
 
187
- cursor.execute("COMMIT")
188
- return entity
127
+ if offset is not None:
128
+ query = query.offset(offset)
129
+ if limit is not None:
130
+ query = query.limit(limit)
189
131
 
190
- except Exception:
191
- cursor.execute("ROLLBACK")
192
- raise
132
+ results = list(query.to_pydantic(DocumentRecord))
133
+ return [self._record_to_document(doc) for doc in results]
193
134
 
194
- async def update(self, entity: Document) -> Document:
195
- """Update an existing document and regenerate its chunks and embeddings."""
196
- # Convert content to DoclingDocument
197
- docling_document = text_to_docling_document(entity.content)
135
+ async def get_by_uri(self, uri: str) -> Document | None:
136
+ """Get a document by its URI."""
137
+ results = list(
138
+ self.store.documents_table.search()
139
+ .where(f"uri = '{uri}'")
140
+ .limit(1)
141
+ .to_pydantic(DocumentRecord)
142
+ )
198
143
 
199
- return await self._update_with_docling(entity, docling_document)
144
+ if not results:
145
+ return None
200
146
 
201
- async def delete(self, entity_id: int) -> bool:
202
- """Delete a document and all its associated chunks and embeddings."""
203
- # Delete chunks and embeddings first
204
- await self.chunk_repository.delete_by_document_id(entity_id)
147
+ return self._record_to_document(results[0])
205
148
 
206
- if self.store._connection is None:
207
- raise ValueError("Store connection is not available")
149
+ async def delete_all(self) -> None:
150
+ """Delete all documents from the database."""
151
+ # Delete all chunks first
152
+ await self.chunk_repository.delete_all()
208
153
 
209
- cursor = self.store._connection.cursor()
210
- cursor.execute("DELETE FROM documents WHERE id = :id", {"id": entity_id})
154
+ # Get count before deletion
155
+ count = len(
156
+ list(
157
+ self.store.documents_table.search().limit(1).to_pydantic(DocumentRecord)
158
+ )
159
+ )
160
+ if count > 0:
161
+ # Drop and recreate table to clear all data
162
+ self.store.db.drop_table("documents")
163
+ self.store.documents_table = self.store.db.create_table(
164
+ "documents", schema=DocumentRecord
165
+ )
211
166
 
212
- deleted = cursor.rowcount > 0
213
- self.store._connection.commit()
214
- return deleted
167
+ async def _create_with_docling(
168
+ self,
169
+ entity: Document,
170
+ docling_document: DoclingDocument,
171
+ chunks: list["Chunk"] | None = None,
172
+ ) -> Document:
173
+ """Create a document with its chunks and embeddings."""
174
+ # Create the document
175
+ created_doc = await self.create(entity)
215
176
 
216
- async def list_all(
217
- self, limit: int | None = None, offset: int | None = None
218
- ) -> list[Document]:
219
- """List all documents with optional pagination."""
220
- if self.store._connection is None:
221
- raise ValueError("Store connection is not available")
177
+ # Create chunks if not provided
178
+ if chunks is None:
179
+ assert created_doc.id is not None, (
180
+ "Document ID should not be None after creation"
181
+ )
182
+ await self.chunk_repository.create_chunks_for_document(
183
+ created_doc.id, docling_document
184
+ )
185
+ else:
186
+ # Use provided chunks, set order from list position
187
+ assert created_doc.id is not None, (
188
+ "Document ID should not be None after creation"
189
+ )
190
+ for order, chunk in enumerate(chunks):
191
+ chunk.document_id = created_doc.id
192
+ chunk.metadata["order"] = order
193
+ await self.chunk_repository.create(chunk)
222
194
 
223
- cursor = self.store._connection.cursor()
224
- query = "SELECT id, content, uri, metadata, created_at, updated_at FROM documents ORDER BY created_at DESC"
225
- params = {}
195
+ return created_doc
226
196
 
227
- if limit is not None:
228
- query += " LIMIT :limit"
229
- params["limit"] = limit
197
+ async def _update_with_docling(
198
+ self, entity: Document, docling_document: DoclingDocument
199
+ ) -> Document:
200
+ """Update a document and regenerate its chunks."""
201
+ # Delete existing chunks
202
+ assert entity.id is not None, "Document ID is required for update"
203
+ await self.chunk_repository.delete_by_document_id(entity.id)
204
+
205
+ # Update the document
206
+ updated_doc = await self.update(entity)
207
+
208
+ # Create new chunks
209
+ assert updated_doc.id is not None, "Document ID should not be None after update"
210
+ await self.chunk_repository.create_chunks_for_document(
211
+ updated_doc.id, docling_document
212
+ )
230
213
 
231
- if offset is not None:
232
- query += " OFFSET :offset"
233
- params["offset"] = offset
234
-
235
- cursor.execute(query, params)
236
- rows = cursor.fetchall()
237
-
238
- return [
239
- Document(
240
- id=document_id,
241
- content=content,
242
- uri=uri,
243
- metadata=json.loads(metadata_json) if metadata_json else {},
244
- created_at=created_at,
245
- updated_at=updated_at,
246
- )
247
- for document_id, content, uri, metadata_json, created_at, updated_at in rows
248
- ]
214
+ return updated_doc
@@ -1,77 +1,143 @@
1
1
  import json
2
- from typing import Any
3
2
 
4
- from haiku.rag.store.engine import Store
3
+ from haiku.rag.config import Config
4
+ from haiku.rag.store.engine import SettingsRecord, Store
5
5
 
6
6
 
7
7
  class ConfigMismatchError(Exception):
8
- """Raised when current config doesn't match stored settings."""
8
+ """Raised when stored config doesn't match current config."""
9
9
 
10
10
  pass
11
11
 
12
12
 
13
13
  class SettingsRepository:
14
- def __init__(self, store: Store):
14
+ """Repository for Settings operations."""
15
+
16
+ def __init__(self, store: Store) -> None:
15
17
  self.store = store
16
18
 
17
- def get(self) -> dict[str, Any]:
18
- """Get all settings from the database."""
19
- if self.store._connection is None:
20
- raise ValueError("Store connection is not available")
19
+ async def create(self, entity: dict) -> dict:
20
+ """Create settings in the database."""
21
+ settings_record = SettingsRecord(id="settings", settings=json.dumps(entity))
22
+ self.store.settings_table.add([settings_record])
23
+ return entity
24
+
25
+ async def get_by_id(self, entity_id: str) -> dict | None:
26
+ """Get settings by ID."""
27
+ results = list(
28
+ self.store.settings_table.search()
29
+ .where(f"id = '{entity_id}'")
30
+ .limit(1)
31
+ .to_pydantic(SettingsRecord)
32
+ )
21
33
 
22
- cursor = self.store._connection.execute("SELECT settings FROM settings LIMIT 1")
23
- row = cursor.fetchone()
24
- if row:
25
- return json.loads(row[0])
26
- return {}
34
+ if not results:
35
+ return None
27
36
 
28
- def save(self) -> None:
29
- """Sync settings from the current AppConfig to database."""
30
- if self.store._connection is None:
31
- raise ValueError("Store connection is not available")
37
+ return json.loads(results[0].settings) if results[0].settings else {}
32
38
 
33
- from haiku.rag.config import Config
39
+ async def update(self, entity: dict) -> dict:
40
+ """Update existing settings."""
41
+ self.store.settings_table.update(
42
+ where="id = 'settings'", values={"settings": json.dumps(entity)}
43
+ )
44
+ return entity
45
+
46
+ async def delete(self, entity_id: str) -> bool:
47
+ """Delete settings by ID."""
48
+ self.store.settings_table.delete(f"id = '{entity_id}'")
49
+ return True
50
+
51
+ async def list_all(
52
+ self, limit: int | None = None, offset: int | None = None
53
+ ) -> list[dict]:
54
+ """List all settings."""
55
+ results = list(self.store.settings_table.search().to_pydantic(SettingsRecord))
56
+ return [
57
+ json.loads(record.settings) if record.settings else {} for record in results
58
+ ]
34
59
 
35
- settings_json = Config.model_dump_json()
60
+ def get_current_settings(self) -> dict:
61
+ """Get the current settings."""
62
+ results = list(
63
+ self.store.settings_table.search()
64
+ .where("id = 'settings'")
65
+ .limit(1)
66
+ .to_pydantic(SettingsRecord)
67
+ )
68
+
69
+ if not results:
70
+ return {}
71
+
72
+ return json.loads(results[0].settings) if results[0].settings else {}
73
+
74
+ def save_current_settings(self) -> None:
75
+ """Save the current configuration to the database."""
76
+ current_config = Config.model_dump(mode="json")
36
77
 
37
- self.store._connection.execute(
38
- "INSERT INTO settings (id, settings) VALUES (1, ?) ON CONFLICT(id) DO UPDATE SET settings = excluded.settings",
39
- (settings_json,),
78
+ # Check if settings exist
79
+ existing = list(
80
+ self.store.settings_table.search()
81
+ .where("id = 'settings'")
82
+ .limit(1)
83
+ .to_pydantic(SettingsRecord)
40
84
  )
41
85
 
42
- self.store._connection.commit()
86
+ if existing:
87
+ # Update existing settings
88
+ self.store.settings_table.update(
89
+ where="id = 'settings'", values={"settings": json.dumps(current_config)}
90
+ )
91
+ else:
92
+ # Create new settings
93
+ settings_record = SettingsRecord(
94
+ id="settings", settings=json.dumps(current_config)
95
+ )
96
+ self.store.settings_table.add([settings_record])
43
97
 
44
98
  def validate_config_compatibility(self) -> None:
45
- """Check if current config is compatible with stored settings.
46
-
47
- Raises ConfigMismatchError if there are incompatible differences.
48
- If no settings exist, saves current config.
49
- """
50
- db_settings = self.get()
51
- if not db_settings:
52
- # No settings in DB, save current config
53
- self.save()
54
- return
99
+ """Validate that the current configuration is compatible with stored settings."""
100
+ stored_settings = self.get_current_settings()
55
101
 
56
- from haiku.rag.config import Config
102
+ # If no stored settings, this is a new database - save current config and return
103
+ if not stored_settings:
104
+ self.save_current_settings()
105
+ return
57
106
 
58
107
  current_config = Config.model_dump(mode="json")
59
108
 
60
- # Critical settings that must match
61
- critical_settings = [
62
- "EMBEDDINGS_PROVIDER",
63
- "EMBEDDINGS_MODEL",
64
- "EMBEDDINGS_VECTOR_DIM",
65
- "CHUNK_SIZE",
66
- ]
109
+ # Check if embedding provider or model has changed
110
+ stored_provider = stored_settings.get("EMBEDDINGS_PROVIDER")
111
+ current_provider = current_config.get("EMBEDDINGS_PROVIDER")
112
+
113
+ stored_model = stored_settings.get("EMBEDDINGS_MODEL")
114
+ current_model = current_config.get("EMBEDDINGS_MODEL")
115
+
116
+ stored_vector_dim = stored_settings.get("EMBEDDINGS_VECTOR_DIM")
117
+ current_vector_dim = current_config.get("EMBEDDINGS_VECTOR_DIM")
118
+
119
+ # Check for incompatible changes
120
+ incompatible_changes = []
121
+
122
+ if stored_provider and stored_provider != current_provider:
123
+ incompatible_changes.append(
124
+ f"Embedding provider changed from '{stored_provider}' to '{current_provider}'"
125
+ )
126
+
127
+ if stored_model and stored_model != current_model:
128
+ incompatible_changes.append(
129
+ f"Embedding model changed from '{stored_model}' to '{current_model}'"
130
+ )
67
131
 
68
- errors = []
69
- for setting in critical_settings:
70
- if db_settings.get(setting) != current_config.get(setting):
71
- errors.append(
72
- f"{setting}: current={current_config.get(setting)}, stored={db_settings.get(setting)}"
73
- )
132
+ if stored_vector_dim and stored_vector_dim != current_vector_dim:
133
+ incompatible_changes.append(
134
+ f"Vector dimension changed from {stored_vector_dim} to {current_vector_dim}"
135
+ )
74
136
 
75
- if errors:
76
- error_msg = f"Config mismatch detected: {'; '.join(errors)}. Consider rebuilding the database with the current configuration."
137
+ if incompatible_changes:
138
+ error_msg = (
139
+ "Database configuration is incompatible with current settings:\n"
140
+ + "\n".join(f" - {change}" for change in incompatible_changes)
141
+ )
142
+ error_msg += "\n\nPlease rebuild the database using: haiku-rag rebuild"
77
143
  raise ConfigMismatchError(error_msg)
@@ -1,3 +1 @@
1
- from haiku.rag.store.upgrades.v0_3_4 import upgrades as v0_3_4_upgrades
2
-
3
- upgrades = v0_3_4_upgrades
1
+ upgrades = []