haiku.rag 0.10.2__py3-none-any.whl → 0.19.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- README.md +172 -0
- {haiku_rag-0.10.2.dist-info → haiku_rag-0.19.3.dist-info}/METADATA +79 -51
- haiku_rag-0.19.3.dist-info/RECORD +6 -0
- {haiku_rag-0.10.2.dist-info → haiku_rag-0.19.3.dist-info}/WHEEL +1 -1
- haiku/rag/__init__.py +0 -0
- haiku/rag/app.py +0 -437
- haiku/rag/chunker.py +0 -51
- haiku/rag/cli.py +0 -466
- haiku/rag/client.py +0 -605
- haiku/rag/config.py +0 -81
- haiku/rag/embeddings/__init__.py +0 -35
- haiku/rag/embeddings/base.py +0 -15
- haiku/rag/embeddings/ollama.py +0 -17
- haiku/rag/embeddings/openai.py +0 -16
- haiku/rag/embeddings/vllm.py +0 -19
- haiku/rag/embeddings/voyageai.py +0 -17
- haiku/rag/logging.py +0 -56
- haiku/rag/mcp.py +0 -156
- haiku/rag/migration.py +0 -316
- haiku/rag/monitor.py +0 -73
- haiku/rag/qa/__init__.py +0 -15
- haiku/rag/qa/agent.py +0 -91
- haiku/rag/qa/prompts.py +0 -60
- haiku/rag/reader.py +0 -115
- haiku/rag/reranking/__init__.py +0 -34
- haiku/rag/reranking/base.py +0 -13
- haiku/rag/reranking/cohere.py +0 -34
- haiku/rag/reranking/mxbai.py +0 -28
- haiku/rag/reranking/vllm.py +0 -44
- haiku/rag/research/__init__.py +0 -20
- haiku/rag/research/common.py +0 -53
- haiku/rag/research/dependencies.py +0 -47
- haiku/rag/research/graph.py +0 -29
- haiku/rag/research/models.py +0 -70
- haiku/rag/research/nodes/evaluate.py +0 -80
- haiku/rag/research/nodes/plan.py +0 -63
- haiku/rag/research/nodes/search.py +0 -93
- haiku/rag/research/nodes/synthesize.py +0 -51
- haiku/rag/research/prompts.py +0 -114
- haiku/rag/research/state.py +0 -25
- haiku/rag/store/__init__.py +0 -4
- haiku/rag/store/engine.py +0 -269
- haiku/rag/store/models/__init__.py +0 -4
- haiku/rag/store/models/chunk.py +0 -17
- haiku/rag/store/models/document.py +0 -17
- haiku/rag/store/repositories/__init__.py +0 -9
- haiku/rag/store/repositories/chunk.py +0 -424
- haiku/rag/store/repositories/document.py +0 -237
- haiku/rag/store/repositories/settings.py +0 -155
- haiku/rag/store/upgrades/__init__.py +0 -62
- haiku/rag/store/upgrades/v0_10_1.py +0 -64
- haiku/rag/store/upgrades/v0_9_3.py +0 -112
- haiku/rag/utils.py +0 -199
- haiku_rag-0.10.2.dist-info/RECORD +0 -54
- {haiku_rag-0.10.2.dist-info → haiku_rag-0.19.3.dist-info}/entry_points.txt +0 -0
- {haiku_rag-0.10.2.dist-info → haiku_rag-0.19.3.dist-info}/licenses/LICENSE +0 -0
haiku/rag/store/engine.py
DELETED
|
@@ -1,269 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import logging
|
|
3
|
-
from datetime import timedelta
|
|
4
|
-
from importlib import metadata
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from uuid import uuid4
|
|
7
|
-
|
|
8
|
-
import lancedb
|
|
9
|
-
from lancedb.pydantic import LanceModel, Vector
|
|
10
|
-
from pydantic import Field
|
|
11
|
-
|
|
12
|
-
from haiku.rag.config import Config
|
|
13
|
-
from haiku.rag.embeddings import get_embedder
|
|
14
|
-
|
|
15
|
-
logger = logging.getLogger(__name__)
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class DocumentRecord(LanceModel):
    """Row schema used when creating the LanceDB ``documents`` table.

    Every column is stored as a string; ``metadata`` holds a JSON-encoded
    object and the two timestamp columns default to the empty string.
    """

    # Identifier; a random UUID4 string is generated when none is supplied.
    id: str = Field(default_factory=lambda: str(uuid4()))
    # Document text.
    content: str
    # Optional source location and optional title.
    uri: str | None = None
    title: str | None = None
    # JSON text; defaults to an empty JSON object.
    metadata: str = "{}"
    # Timestamps kept as strings; empty unless the caller provides a value.
    created_at: str = Field(default_factory=str)
    updated_at: str = Field(default_factory=str)
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
def create_chunk_model(vector_dim: int):
    """Build the ``ChunkRecord`` LanceDB model for a given embedding size.

    The vector column type depends on the dimension, so the model class is
    created on demand rather than declared once at module level.

    Args:
        vector_dim: Number of components in each embedding vector.

    Returns:
        A ``LanceModel`` subclass named ``ChunkRecord`` whose ``vector``
        column is typed as ``Vector(vector_dim)``.
    """

    def _zero_vector() -> list[float]:
        # Default embedding: an all-zero vector of the requested dimension.
        return [0.0] * vector_dim

    class ChunkRecord(LanceModel):
        # Identifier; a random UUID4 string is generated when none is supplied.
        id: str = Field(default_factory=lambda: str(uuid4()))
        # Identifier of the document this chunk belongs to.
        document_id: str
        # Chunk text.
        content: str
        # JSON text; defaults to an empty JSON object.
        metadata: str = "{}"
        # Integer ordering value; defaults to 0.
        order: int = 0
        vector: Vector(vector_dim) = Field(default_factory=_zero_vector)  # type: ignore

    return ChunkRecord
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
class SettingsRecord(LanceModel):
    """Row schema used when creating the LanceDB ``settings`` table.

    The row id defaults to ``"settings"`` and the ``settings`` column holds
    a JSON-encoded object.
    """

    # Row identifier; defaults to the fixed string "settings".
    id: str = "settings"
    # JSON text; defaults to an empty JSON object.
    settings: str = "{}"
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
class Store:
|
|
51
|
-
def __init__(self, db_path: Path, skip_validation: bool = False):
|
|
52
|
-
self.db_path: Path = db_path
|
|
53
|
-
self.embedder = get_embedder()
|
|
54
|
-
|
|
55
|
-
# Create the ChunkRecord model with the correct vector dimension
|
|
56
|
-
self.ChunkRecord = create_chunk_model(self.embedder._vector_dim)
|
|
57
|
-
|
|
58
|
-
# Local filesystem handling for DB directory
|
|
59
|
-
if not self._has_cloud_config():
|
|
60
|
-
if Config.DISABLE_DB_AUTOCREATE:
|
|
61
|
-
# LanceDB uses a directory path for local databases; enforce presence
|
|
62
|
-
if not db_path.exists():
|
|
63
|
-
raise FileNotFoundError(
|
|
64
|
-
f"LanceDB path does not exist: {db_path}. Auto-creation is disabled."
|
|
65
|
-
)
|
|
66
|
-
else:
|
|
67
|
-
# Ensure parent directories exist when autocreation allowed
|
|
68
|
-
if not db_path.parent.exists():
|
|
69
|
-
Path.mkdir(db_path.parent, parents=True)
|
|
70
|
-
|
|
71
|
-
# Connect to LanceDB
|
|
72
|
-
self.db = self._connect_to_lancedb(db_path)
|
|
73
|
-
|
|
74
|
-
# Initialize tables
|
|
75
|
-
self.create_or_update_db()
|
|
76
|
-
|
|
77
|
-
# Validate config compatibility after connection is established
|
|
78
|
-
if not skip_validation:
|
|
79
|
-
self._validate_configuration()
|
|
80
|
-
|
|
81
|
-
def vacuum(self) -> None:
|
|
82
|
-
"""Optimize and clean up old versions across all tables to reduce disk usage."""
|
|
83
|
-
if self._has_cloud_config() and str(Config.LANCEDB_URI).startswith("db://"):
|
|
84
|
-
return
|
|
85
|
-
|
|
86
|
-
# Perform maintenance per table using optimize() with cleanup_older_than 0
|
|
87
|
-
for table in [self.documents_table, self.chunks_table, self.settings_table]:
|
|
88
|
-
table.optimize(cleanup_older_than=timedelta(0))
|
|
89
|
-
|
|
90
|
-
def _connect_to_lancedb(self, db_path: Path):
|
|
91
|
-
"""Establish connection to LanceDB (local, cloud, or object storage)."""
|
|
92
|
-
# Check if we have cloud configuration
|
|
93
|
-
if self._has_cloud_config():
|
|
94
|
-
return lancedb.connect(
|
|
95
|
-
uri=Config.LANCEDB_URI,
|
|
96
|
-
api_key=Config.LANCEDB_API_KEY,
|
|
97
|
-
region=Config.LANCEDB_REGION,
|
|
98
|
-
)
|
|
99
|
-
else:
|
|
100
|
-
# Local file system connection
|
|
101
|
-
return lancedb.connect(db_path)
|
|
102
|
-
|
|
103
|
-
def _has_cloud_config(self) -> bool:
|
|
104
|
-
"""Check if cloud configuration is complete."""
|
|
105
|
-
return bool(
|
|
106
|
-
Config.LANCEDB_URI and Config.LANCEDB_API_KEY and Config.LANCEDB_REGION
|
|
107
|
-
)
|
|
108
|
-
|
|
109
|
-
def _validate_configuration(self) -> None:
|
|
110
|
-
"""Validate that the configuration is compatible with the database."""
|
|
111
|
-
from haiku.rag.store.repositories.settings import SettingsRepository
|
|
112
|
-
|
|
113
|
-
settings_repo = SettingsRepository(self)
|
|
114
|
-
settings_repo.validate_config_compatibility()
|
|
115
|
-
|
|
116
|
-
def create_or_update_db(self):
|
|
117
|
-
"""Create the database tables."""
|
|
118
|
-
|
|
119
|
-
# Get list of existing tables
|
|
120
|
-
existing_tables = self.db.table_names()
|
|
121
|
-
|
|
122
|
-
# Create or get documents table
|
|
123
|
-
if "documents" in existing_tables:
|
|
124
|
-
self.documents_table = self.db.open_table("documents")
|
|
125
|
-
else:
|
|
126
|
-
self.documents_table = self.db.create_table(
|
|
127
|
-
"documents", schema=DocumentRecord
|
|
128
|
-
)
|
|
129
|
-
|
|
130
|
-
# Create or get chunks table
|
|
131
|
-
if "chunks" in existing_tables:
|
|
132
|
-
self.chunks_table = self.db.open_table("chunks")
|
|
133
|
-
else:
|
|
134
|
-
self.chunks_table = self.db.create_table("chunks", schema=self.ChunkRecord)
|
|
135
|
-
# Create FTS index on the new table with phrase query support
|
|
136
|
-
self.chunks_table.create_fts_index(
|
|
137
|
-
"content", replace=True, with_position=True, remove_stop_words=False
|
|
138
|
-
)
|
|
139
|
-
|
|
140
|
-
# Create or get settings table
|
|
141
|
-
if "settings" in existing_tables:
|
|
142
|
-
self.settings_table = self.db.open_table("settings")
|
|
143
|
-
else:
|
|
144
|
-
self.settings_table = self.db.create_table(
|
|
145
|
-
"settings", schema=SettingsRecord
|
|
146
|
-
)
|
|
147
|
-
# Save current settings to the new database
|
|
148
|
-
settings_data = Config.model_dump(mode="json")
|
|
149
|
-
self.settings_table.add(
|
|
150
|
-
[SettingsRecord(id="settings", settings=json.dumps(settings_data))]
|
|
151
|
-
)
|
|
152
|
-
|
|
153
|
-
# Run pending upgrades based on stored version and package version
|
|
154
|
-
try:
|
|
155
|
-
from haiku.rag.store.upgrades import run_pending_upgrades
|
|
156
|
-
|
|
157
|
-
current_version = metadata.version("haiku.rag")
|
|
158
|
-
db_version = self.get_haiku_version()
|
|
159
|
-
|
|
160
|
-
run_pending_upgrades(self, db_version, current_version)
|
|
161
|
-
|
|
162
|
-
# After upgrades complete (or if none), set stored version
|
|
163
|
-
# to the greater of the installed package version and the
|
|
164
|
-
# highest available upgrade step version in code.
|
|
165
|
-
try:
|
|
166
|
-
from packaging.version import parse as _v
|
|
167
|
-
|
|
168
|
-
from haiku.rag.store.upgrades import upgrades as _steps
|
|
169
|
-
|
|
170
|
-
highest_step = max((_v(u.version) for u in _steps), default=None)
|
|
171
|
-
effective_version = (
|
|
172
|
-
str(max(_v(current_version), highest_step))
|
|
173
|
-
if highest_step is not None
|
|
174
|
-
else current_version
|
|
175
|
-
)
|
|
176
|
-
except Exception:
|
|
177
|
-
effective_version = current_version
|
|
178
|
-
|
|
179
|
-
self.set_haiku_version(effective_version)
|
|
180
|
-
except Exception as e:
|
|
181
|
-
# Avoid hard failure on initial connection; log and continue so CLI remains usable.
|
|
182
|
-
logger.warning(
|
|
183
|
-
"Skipping upgrade due to error (db=%s -> pkg=%s): %s",
|
|
184
|
-
self.get_haiku_version(),
|
|
185
|
-
metadata.version("haiku.rag") if hasattr(metadata, "version") else "",
|
|
186
|
-
e,
|
|
187
|
-
)
|
|
188
|
-
|
|
189
|
-
def get_haiku_version(self) -> str:
|
|
190
|
-
"""Returns the user version stored in settings."""
|
|
191
|
-
settings_records = list(
|
|
192
|
-
self.settings_table.search().limit(1).to_pydantic(SettingsRecord)
|
|
193
|
-
)
|
|
194
|
-
if settings_records:
|
|
195
|
-
settings = (
|
|
196
|
-
json.loads(settings_records[0].settings)
|
|
197
|
-
if settings_records[0].settings
|
|
198
|
-
else {}
|
|
199
|
-
)
|
|
200
|
-
return settings.get("version", "0.0.0")
|
|
201
|
-
return "0.0.0"
|
|
202
|
-
|
|
203
|
-
def set_haiku_version(self, version: str) -> None:
|
|
204
|
-
"""Updates the user version in settings."""
|
|
205
|
-
settings_records = list(
|
|
206
|
-
self.settings_table.search().limit(1).to_pydantic(SettingsRecord)
|
|
207
|
-
)
|
|
208
|
-
if settings_records:
|
|
209
|
-
# Only write if version actually changes to avoid creating new table versions
|
|
210
|
-
current = (
|
|
211
|
-
json.loads(settings_records[0].settings)
|
|
212
|
-
if settings_records[0].settings
|
|
213
|
-
else {}
|
|
214
|
-
)
|
|
215
|
-
if current.get("version") != version:
|
|
216
|
-
current["version"] = version
|
|
217
|
-
self.settings_table.update(
|
|
218
|
-
where="id = 'settings'",
|
|
219
|
-
values={"settings": json.dumps(current)},
|
|
220
|
-
)
|
|
221
|
-
else:
|
|
222
|
-
# Create new settings record
|
|
223
|
-
settings_data = Config.model_dump(mode="json")
|
|
224
|
-
settings_data["version"] = version
|
|
225
|
-
self.settings_table.add(
|
|
226
|
-
[SettingsRecord(id="settings", settings=json.dumps(settings_data))]
|
|
227
|
-
)
|
|
228
|
-
|
|
229
|
-
def recreate_embeddings_table(self) -> None:
|
|
230
|
-
"""Recreate the chunks table with current vector dimensions."""
|
|
231
|
-
# Drop and recreate chunks table
|
|
232
|
-
try:
|
|
233
|
-
self.db.drop_table("chunks")
|
|
234
|
-
except Exception:
|
|
235
|
-
pass
|
|
236
|
-
|
|
237
|
-
# Update the ChunkRecord model with new vector dimension
|
|
238
|
-
self.ChunkRecord = create_chunk_model(self.embedder._vector_dim)
|
|
239
|
-
self.chunks_table = self.db.create_table("chunks", schema=self.ChunkRecord)
|
|
240
|
-
|
|
241
|
-
# Create FTS index on the new table with phrase query support
|
|
242
|
-
self.chunks_table.create_fts_index(
|
|
243
|
-
"content", replace=True, with_position=True, remove_stop_words=False
|
|
244
|
-
)
|
|
245
|
-
|
|
246
|
-
def close(self):
|
|
247
|
-
"""Close the database connection."""
|
|
248
|
-
# LanceDB connections are automatically managed
|
|
249
|
-
pass
|
|
250
|
-
|
|
251
|
-
def current_table_versions(self) -> dict[str, int]:
|
|
252
|
-
"""Capture current versions of key tables for rollback using LanceDB's API."""
|
|
253
|
-
return {
|
|
254
|
-
"documents": int(self.documents_table.version),
|
|
255
|
-
"chunks": int(self.chunks_table.version),
|
|
256
|
-
"settings": int(self.settings_table.version),
|
|
257
|
-
}
|
|
258
|
-
|
|
259
|
-
def restore_table_versions(self, versions: dict[str, int]) -> bool:
|
|
260
|
-
"""Restore tables to the provided versions using LanceDB's API."""
|
|
261
|
-
self.documents_table.restore(int(versions["documents"]))
|
|
262
|
-
self.chunks_table.restore(int(versions["chunks"]))
|
|
263
|
-
self.settings_table.restore(int(versions["settings"]))
|
|
264
|
-
return True
|
|
265
|
-
|
|
266
|
-
@property
|
|
267
|
-
def _connection(self):
|
|
268
|
-
"""Compatibility property for repositories expecting _connection."""
|
|
269
|
-
return self
|
haiku/rag/store/models/chunk.py
DELETED
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
from pydantic import BaseModel
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
class Chunk(BaseModel):
    """A piece of content with its metadata and optional parent-document details.

    Only ``content`` is required; every other field has a default.
    """

    # Identifiers of the chunk itself and of its parent document.
    id: str | None = None
    document_id: str | None = None
    # Chunk text.
    content: str
    # Free-form metadata attached to the chunk.
    metadata: dict = {}
    # Integer ordering value; defaults to 0.
    order: int = 0
    # Optional details about the parent document.
    document_uri: str | None = None
    document_title: str | None = None
    document_meta: dict = {}
    # Embedding vector, when available.
    embedding: list[float] | None = None
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
from datetime import datetime
|
|
2
|
-
|
|
3
|
-
from pydantic import BaseModel, Field
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class Document(BaseModel):
    """A document made of an optional ID, its content, and descriptive metadata.

    Only ``content`` is required. Both timestamps default to the time the
    instance is created, each obtained from its own ``datetime.now`` call.
    """

    # Identifier; None until one is assigned.
    id: str | None = None
    # Document text.
    content: str
    # Optional source location and optional title.
    uri: str | None = None
    title: str | None = None
    # Free-form metadata attached to the document.
    metadata: dict = {}
    # Creation and last-update timestamps.
    created_at: datetime = Field(default_factory=datetime.now)
    updated_at: datetime = Field(default_factory=datetime.now)
|
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
from haiku.rag.store.repositories.chunk import ChunkRepository
|
|
2
|
-
from haiku.rag.store.repositories.document import DocumentRepository
|
|
3
|
-
from haiku.rag.store.repositories.settings import SettingsRepository
|
|
4
|
-
|
|
5
|
-
__all__ = [
|
|
6
|
-
"ChunkRepository",
|
|
7
|
-
"DocumentRepository",
|
|
8
|
-
"SettingsRepository",
|
|
9
|
-
]
|