haiku.rag 0.10.2__py3-none-any.whl → 0.19.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. README.md +172 -0
  2. {haiku_rag-0.10.2.dist-info → haiku_rag-0.19.3.dist-info}/METADATA +79 -51
  3. haiku_rag-0.19.3.dist-info/RECORD +6 -0
  4. {haiku_rag-0.10.2.dist-info → haiku_rag-0.19.3.dist-info}/WHEEL +1 -1
  5. haiku/rag/__init__.py +0 -0
  6. haiku/rag/app.py +0 -437
  7. haiku/rag/chunker.py +0 -51
  8. haiku/rag/cli.py +0 -466
  9. haiku/rag/client.py +0 -605
  10. haiku/rag/config.py +0 -81
  11. haiku/rag/embeddings/__init__.py +0 -35
  12. haiku/rag/embeddings/base.py +0 -15
  13. haiku/rag/embeddings/ollama.py +0 -17
  14. haiku/rag/embeddings/openai.py +0 -16
  15. haiku/rag/embeddings/vllm.py +0 -19
  16. haiku/rag/embeddings/voyageai.py +0 -17
  17. haiku/rag/logging.py +0 -56
  18. haiku/rag/mcp.py +0 -156
  19. haiku/rag/migration.py +0 -316
  20. haiku/rag/monitor.py +0 -73
  21. haiku/rag/qa/__init__.py +0 -15
  22. haiku/rag/qa/agent.py +0 -91
  23. haiku/rag/qa/prompts.py +0 -60
  24. haiku/rag/reader.py +0 -115
  25. haiku/rag/reranking/__init__.py +0 -34
  26. haiku/rag/reranking/base.py +0 -13
  27. haiku/rag/reranking/cohere.py +0 -34
  28. haiku/rag/reranking/mxbai.py +0 -28
  29. haiku/rag/reranking/vllm.py +0 -44
  30. haiku/rag/research/__init__.py +0 -20
  31. haiku/rag/research/common.py +0 -53
  32. haiku/rag/research/dependencies.py +0 -47
  33. haiku/rag/research/graph.py +0 -29
  34. haiku/rag/research/models.py +0 -70
  35. haiku/rag/research/nodes/evaluate.py +0 -80
  36. haiku/rag/research/nodes/plan.py +0 -63
  37. haiku/rag/research/nodes/search.py +0 -93
  38. haiku/rag/research/nodes/synthesize.py +0 -51
  39. haiku/rag/research/prompts.py +0 -114
  40. haiku/rag/research/state.py +0 -25
  41. haiku/rag/store/__init__.py +0 -4
  42. haiku/rag/store/engine.py +0 -269
  43. haiku/rag/store/models/__init__.py +0 -4
  44. haiku/rag/store/models/chunk.py +0 -17
  45. haiku/rag/store/models/document.py +0 -17
  46. haiku/rag/store/repositories/__init__.py +0 -9
  47. haiku/rag/store/repositories/chunk.py +0 -424
  48. haiku/rag/store/repositories/document.py +0 -237
  49. haiku/rag/store/repositories/settings.py +0 -155
  50. haiku/rag/store/upgrades/__init__.py +0 -62
  51. haiku/rag/store/upgrades/v0_10_1.py +0 -64
  52. haiku/rag/store/upgrades/v0_9_3.py +0 -112
  53. haiku/rag/utils.py +0 -199
  54. haiku_rag-0.10.2.dist-info/RECORD +0 -54
  55. {haiku_rag-0.10.2.dist-info → haiku_rag-0.19.3.dist-info}/entry_points.txt +0 -0
  56. {haiku_rag-0.10.2.dist-info → haiku_rag-0.19.3.dist-info}/licenses/LICENSE +0 -0
haiku/rag/store/engine.py DELETED
@@ -1,269 +0,0 @@
1
- import json
2
- import logging
3
- from datetime import timedelta
4
- from importlib import metadata
5
- from pathlib import Path
6
- from uuid import uuid4
7
-
8
- import lancedb
9
- from lancedb.pydantic import LanceModel, Vector
10
- from pydantic import Field
11
-
12
- from haiku.rag.config import Config
13
- from haiku.rag.embeddings import get_embedder
14
-
15
- logger = logging.getLogger(__name__)
16
-
17
-
18
def _new_record_id() -> str:
    """Return a freshly generated random UUID rendered as a string."""
    return str(uuid4())


class DocumentRecord(LanceModel):
    """LanceDB row schema for the ``documents`` table.

    Every column is stored as a string. ``id`` is filled with a new UUID
    when not supplied, ``metadata`` defaults to the text ``"{}"`` and the
    two timestamp columns default to the empty string.
    """

    id: str = Field(default_factory=_new_record_id)
    content: str
    uri: str | None = None
    title: str | None = None
    # Serialized metadata; presumably JSON text -- verify against the repositories.
    metadata: str = "{}"
    # Timestamps are kept as strings; calling str() with no arguments yields "".
    created_at: str = Field(default_factory=str)
    updated_at: str = Field(default_factory=str)
26
-
27
-
28
def create_chunk_model(vector_dim: int):
    """Build a ``ChunkRecord`` LanceDB model whose vector column has ``vector_dim`` entries.

    The class is defined inside this function so that the ``vector`` field
    can be typed with the requested dimension.

    Args:
        vector_dim: Number of components in the ``vector`` column.

    Returns:
        The newly created ``ChunkRecord`` model class.
    """

    def _fresh_id() -> str:
        return str(uuid4())

    def _zero_vector() -> list[float]:
        # Default vector: all zeros, sized to match the column type.
        return [0.0] * vector_dim

    class ChunkRecord(LanceModel):
        id: str = Field(default_factory=_fresh_id)
        document_id: str
        content: str
        metadata: str = "{}"
        order: int = 0
        vector: Vector(vector_dim) = Field(default_factory=_zero_vector)  # type: ignore

    return ChunkRecord
43
-
44
-
45
class SettingsRecord(LanceModel):
    """LanceDB row schema for the ``settings`` table.

    Both columns are strings: ``id`` defaults to ``"settings"`` and
    ``settings`` defaults to the text ``"{}"``.
    """

    id: str = "settings"
    settings: str = "{}"
48
-
49
-
50
- class Store:
51
- def __init__(self, db_path: Path, skip_validation: bool = False):
52
- self.db_path: Path = db_path
53
- self.embedder = get_embedder()
54
-
55
- # Create the ChunkRecord model with the correct vector dimension
56
- self.ChunkRecord = create_chunk_model(self.embedder._vector_dim)
57
-
58
- # Local filesystem handling for DB directory
59
- if not self._has_cloud_config():
60
- if Config.DISABLE_DB_AUTOCREATE:
61
- # LanceDB uses a directory path for local databases; enforce presence
62
- if not db_path.exists():
63
- raise FileNotFoundError(
64
- f"LanceDB path does not exist: {db_path}. Auto-creation is disabled."
65
- )
66
- else:
67
- # Ensure parent directories exist when autocreation allowed
68
- if not db_path.parent.exists():
69
- Path.mkdir(db_path.parent, parents=True)
70
-
71
- # Connect to LanceDB
72
- self.db = self._connect_to_lancedb(db_path)
73
-
74
- # Initialize tables
75
- self.create_or_update_db()
76
-
77
- # Validate config compatibility after connection is established
78
- if not skip_validation:
79
- self._validate_configuration()
80
-
81
- def vacuum(self) -> None:
82
- """Optimize and clean up old versions across all tables to reduce disk usage."""
83
- if self._has_cloud_config() and str(Config.LANCEDB_URI).startswith("db://"):
84
- return
85
-
86
- # Perform maintenance per table using optimize() with cleanup_older_than 0
87
- for table in [self.documents_table, self.chunks_table, self.settings_table]:
88
- table.optimize(cleanup_older_than=timedelta(0))
89
-
90
- def _connect_to_lancedb(self, db_path: Path):
91
- """Establish connection to LanceDB (local, cloud, or object storage)."""
92
- # Check if we have cloud configuration
93
- if self._has_cloud_config():
94
- return lancedb.connect(
95
- uri=Config.LANCEDB_URI,
96
- api_key=Config.LANCEDB_API_KEY,
97
- region=Config.LANCEDB_REGION,
98
- )
99
- else:
100
- # Local file system connection
101
- return lancedb.connect(db_path)
102
-
103
- def _has_cloud_config(self) -> bool:
104
- """Check if cloud configuration is complete."""
105
- return bool(
106
- Config.LANCEDB_URI and Config.LANCEDB_API_KEY and Config.LANCEDB_REGION
107
- )
108
-
109
- def _validate_configuration(self) -> None:
110
- """Validate that the configuration is compatible with the database."""
111
- from haiku.rag.store.repositories.settings import SettingsRepository
112
-
113
- settings_repo = SettingsRepository(self)
114
- settings_repo.validate_config_compatibility()
115
-
116
- def create_or_update_db(self):
117
- """Create the database tables."""
118
-
119
- # Get list of existing tables
120
- existing_tables = self.db.table_names()
121
-
122
- # Create or get documents table
123
- if "documents" in existing_tables:
124
- self.documents_table = self.db.open_table("documents")
125
- else:
126
- self.documents_table = self.db.create_table(
127
- "documents", schema=DocumentRecord
128
- )
129
-
130
- # Create or get chunks table
131
- if "chunks" in existing_tables:
132
- self.chunks_table = self.db.open_table("chunks")
133
- else:
134
- self.chunks_table = self.db.create_table("chunks", schema=self.ChunkRecord)
135
- # Create FTS index on the new table with phrase query support
136
- self.chunks_table.create_fts_index(
137
- "content", replace=True, with_position=True, remove_stop_words=False
138
- )
139
-
140
- # Create or get settings table
141
- if "settings" in existing_tables:
142
- self.settings_table = self.db.open_table("settings")
143
- else:
144
- self.settings_table = self.db.create_table(
145
- "settings", schema=SettingsRecord
146
- )
147
- # Save current settings to the new database
148
- settings_data = Config.model_dump(mode="json")
149
- self.settings_table.add(
150
- [SettingsRecord(id="settings", settings=json.dumps(settings_data))]
151
- )
152
-
153
- # Run pending upgrades based on stored version and package version
154
- try:
155
- from haiku.rag.store.upgrades import run_pending_upgrades
156
-
157
- current_version = metadata.version("haiku.rag")
158
- db_version = self.get_haiku_version()
159
-
160
- run_pending_upgrades(self, db_version, current_version)
161
-
162
- # After upgrades complete (or if none), set stored version
163
- # to the greater of the installed package version and the
164
- # highest available upgrade step version in code.
165
- try:
166
- from packaging.version import parse as _v
167
-
168
- from haiku.rag.store.upgrades import upgrades as _steps
169
-
170
- highest_step = max((_v(u.version) for u in _steps), default=None)
171
- effective_version = (
172
- str(max(_v(current_version), highest_step))
173
- if highest_step is not None
174
- else current_version
175
- )
176
- except Exception:
177
- effective_version = current_version
178
-
179
- self.set_haiku_version(effective_version)
180
- except Exception as e:
181
- # Avoid hard failure on initial connection; log and continue so CLI remains usable.
182
- logger.warning(
183
- "Skipping upgrade due to error (db=%s -> pkg=%s): %s",
184
- self.get_haiku_version(),
185
- metadata.version("haiku.rag") if hasattr(metadata, "version") else "",
186
- e,
187
- )
188
-
189
- def get_haiku_version(self) -> str:
190
- """Returns the user version stored in settings."""
191
- settings_records = list(
192
- self.settings_table.search().limit(1).to_pydantic(SettingsRecord)
193
- )
194
- if settings_records:
195
- settings = (
196
- json.loads(settings_records[0].settings)
197
- if settings_records[0].settings
198
- else {}
199
- )
200
- return settings.get("version", "0.0.0")
201
- return "0.0.0"
202
-
203
- def set_haiku_version(self, version: str) -> None:
204
- """Updates the user version in settings."""
205
- settings_records = list(
206
- self.settings_table.search().limit(1).to_pydantic(SettingsRecord)
207
- )
208
- if settings_records:
209
- # Only write if version actually changes to avoid creating new table versions
210
- current = (
211
- json.loads(settings_records[0].settings)
212
- if settings_records[0].settings
213
- else {}
214
- )
215
- if current.get("version") != version:
216
- current["version"] = version
217
- self.settings_table.update(
218
- where="id = 'settings'",
219
- values={"settings": json.dumps(current)},
220
- )
221
- else:
222
- # Create new settings record
223
- settings_data = Config.model_dump(mode="json")
224
- settings_data["version"] = version
225
- self.settings_table.add(
226
- [SettingsRecord(id="settings", settings=json.dumps(settings_data))]
227
- )
228
-
229
- def recreate_embeddings_table(self) -> None:
230
- """Recreate the chunks table with current vector dimensions."""
231
- # Drop and recreate chunks table
232
- try:
233
- self.db.drop_table("chunks")
234
- except Exception:
235
- pass
236
-
237
- # Update the ChunkRecord model with new vector dimension
238
- self.ChunkRecord = create_chunk_model(self.embedder._vector_dim)
239
- self.chunks_table = self.db.create_table("chunks", schema=self.ChunkRecord)
240
-
241
- # Create FTS index on the new table with phrase query support
242
- self.chunks_table.create_fts_index(
243
- "content", replace=True, with_position=True, remove_stop_words=False
244
- )
245
-
246
- def close(self):
247
- """Close the database connection."""
248
- # LanceDB connections are automatically managed
249
- pass
250
-
251
- def current_table_versions(self) -> dict[str, int]:
252
- """Capture current versions of key tables for rollback using LanceDB's API."""
253
- return {
254
- "documents": int(self.documents_table.version),
255
- "chunks": int(self.chunks_table.version),
256
- "settings": int(self.settings_table.version),
257
- }
258
-
259
- def restore_table_versions(self, versions: dict[str, int]) -> bool:
260
- """Restore tables to the provided versions using LanceDB's API."""
261
- self.documents_table.restore(int(versions["documents"]))
262
- self.chunks_table.restore(int(versions["chunks"]))
263
- self.settings_table.restore(int(versions["settings"]))
264
- return True
265
-
266
- @property
267
- def _connection(self):
268
- """Compatibility property for repositories expecting _connection."""
269
- return self
@@ -1,4 +0,0 @@
1
- from .chunk import Chunk
2
- from .document import Document
3
-
4
- __all__ = ["Chunk", "Document"]
@@ -1,17 +0,0 @@
1
- from pydantic import BaseModel
2
-
3
-
4
class Chunk(BaseModel):
    """A piece of content with its metadata and optional parent-document details.

    Only ``content`` is required; every other field has a default.
    """

    # Identity of the chunk and of the document it belongs to.
    id: str | None = None
    document_id: str | None = None

    # The chunk's own payload.
    content: str
    metadata: dict = {}
    order: int = 0

    # Optional information about the parent document.
    document_uri: str | None = None
    document_title: str | None = None
    document_meta: dict = {}

    # Embedding values for this chunk, when available.
    embedding: list[float] | None = None
@@ -1,17 +0,0 @@
1
- from datetime import datetime
2
-
3
- from pydantic import BaseModel, Field
4
-
5
-
6
class Document(BaseModel):
    """A document made of an optional ID, its content, and descriptive metadata.

    Only ``content`` is required. Both timestamps default to the value of
    ``datetime.now()`` at the moment the instance is created.
    """

    id: str | None = None
    content: str

    # Optional descriptive fields.
    uri: str | None = None
    title: str | None = None
    metadata: dict = {}

    # Creation and last-update timestamps.
    created_at: datetime = Field(default_factory=datetime.now)
    updated_at: datetime = Field(default_factory=datetime.now)
@@ -1,9 +0,0 @@
1
- from haiku.rag.store.repositories.chunk import ChunkRepository
2
- from haiku.rag.store.repositories.document import DocumentRepository
3
- from haiku.rag.store.repositories.settings import SettingsRepository
4
-
5
- __all__ = [
6
- "ChunkRepository",
7
- "DocumentRepository",
8
- "SettingsRepository",
9
- ]