haiku.rag: haiku_rag-0.5.5-py3-none-any.whl → haiku_rag-0.7.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of haiku.rag might be problematic. Click here for more details.
- haiku/rag/app.py +4 -4
- haiku/rag/cli.py +38 -27
- haiku/rag/client.py +19 -23
- haiku/rag/config.py +6 -2
- haiku/rag/embeddings/__init__.py +3 -9
- haiku/rag/embeddings/openai.py +10 -13
- haiku/rag/logging.py +4 -0
- haiku/rag/mcp.py +12 -9
- haiku/rag/migration.py +316 -0
- haiku/rag/qa/__init__.py +10 -39
- haiku/rag/qa/agent.py +76 -0
- haiku/rag/qa/prompts.py +2 -0
- haiku/rag/reranking/__init__.py +0 -6
- haiku/rag/store/engine.py +173 -141
- haiku/rag/store/models/chunk.py +2 -2
- haiku/rag/store/models/document.py +1 -1
- haiku/rag/store/repositories/__init__.py +6 -2
- haiku/rag/store/repositories/chunk.py +279 -414
- haiku/rag/store/repositories/document.py +171 -205
- haiku/rag/store/repositories/settings.py +115 -49
- haiku/rag/store/upgrades/__init__.py +1 -3
- haiku/rag/utils.py +39 -31
- {haiku_rag-0.5.5.dist-info → haiku_rag-0.7.0.dist-info}/METADATA +22 -22
- haiku_rag-0.7.0.dist-info/RECORD +39 -0
- haiku/rag/qa/anthropic.py +0 -108
- haiku/rag/qa/base.py +0 -89
- haiku/rag/qa/ollama.py +0 -60
- haiku/rag/qa/openai.py +0 -97
- haiku/rag/reranking/ollama.py +0 -84
- haiku/rag/store/repositories/base.py +0 -40
- haiku/rag/store/upgrades/v0_3_4.py +0 -26
- haiku_rag-0.5.5.dist-info/RECORD +0 -44
- {haiku_rag-0.5.5.dist-info → haiku_rag-0.7.0.dist-info}/WHEEL +0 -0
- {haiku_rag-0.5.5.dist-info → haiku_rag-0.7.0.dist-info}/entry_points.txt +0 -0
- {haiku_rag-0.5.5.dist-info → haiku_rag-0.7.0.dist-info}/licenses/LICENSE +0 -0
haiku/rag/store/engine.py
CHANGED
|
@@ -1,171 +1,203 @@
|
|
|
1
|
-
import
|
|
2
|
-
import
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
3
|
from importlib import metadata
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from
|
|
5
|
+
from uuid import uuid4
|
|
6
6
|
|
|
7
|
-
import
|
|
8
|
-
from
|
|
9
|
-
from
|
|
7
|
+
import lancedb
|
|
8
|
+
from lancedb.pydantic import LanceModel, Vector
|
|
9
|
+
from pydantic import Field
|
|
10
10
|
|
|
11
11
|
from haiku.rag.config import Config
|
|
12
12
|
from haiku.rag.embeddings import get_embedder
|
|
13
|
-
|
|
14
|
-
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class DocumentRecord(LanceModel):
|
|
18
|
+
id: str = Field(default_factory=lambda: str(uuid4()))
|
|
19
|
+
content: str
|
|
20
|
+
uri: str | None = None
|
|
21
|
+
metadata: str = Field(default="{}")
|
|
22
|
+
created_at: str = Field(default_factory=lambda: "")
|
|
23
|
+
updated_at: str = Field(default_factory=lambda: "")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def create_chunk_model(vector_dim: int):
|
|
27
|
+
"""Create a ChunkRecord model with the specified vector dimension.
|
|
28
|
+
|
|
29
|
+
This creates a model with proper vector typing for LanceDB.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
class ChunkRecord(LanceModel):
|
|
33
|
+
id: str = Field(default_factory=lambda: str(uuid4()))
|
|
34
|
+
document_id: str
|
|
35
|
+
content: str
|
|
36
|
+
metadata: str = Field(default="{}")
|
|
37
|
+
vector: Vector(vector_dim) = Field(default_factory=lambda: [0.0] * vector_dim) # type: ignore
|
|
38
|
+
|
|
39
|
+
return ChunkRecord
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class SettingsRecord(LanceModel):
|
|
43
|
+
id: str = Field(default="settings")
|
|
44
|
+
settings: str = Field(default="{}")
|
|
15
45
|
|
|
16
46
|
|
|
17
47
|
class Store:
|
|
18
|
-
def __init__(
|
|
19
|
-
self
|
|
20
|
-
|
|
21
|
-
|
|
48
|
+
def __init__(self, db_path: Path, skip_validation: bool = False):
|
|
49
|
+
self.db_path: Path = db_path
|
|
50
|
+
self.embedder = get_embedder()
|
|
51
|
+
|
|
52
|
+
# Create the ChunkRecord model with the correct vector dimension
|
|
53
|
+
self.ChunkRecord = create_chunk_model(self.embedder._vector_dim)
|
|
54
|
+
|
|
55
|
+
# Connect to LanceDB
|
|
56
|
+
self.db = self._connect_to_lancedb(db_path)
|
|
57
|
+
|
|
58
|
+
# Initialize tables
|
|
22
59
|
self.create_or_update_db()
|
|
23
60
|
|
|
24
61
|
# Validate config compatibility after connection is established
|
|
25
62
|
if not skip_validation:
|
|
26
|
-
|
|
63
|
+
self._validate_configuration()
|
|
64
|
+
|
|
65
|
+
def _connect_to_lancedb(self, db_path: Path):
|
|
66
|
+
"""Establish connection to LanceDB (local, cloud, or object storage)."""
|
|
67
|
+
# Check if we have cloud configuration
|
|
68
|
+
if self._has_cloud_config():
|
|
69
|
+
return lancedb.connect(
|
|
70
|
+
uri=Config.LANCEDB_URI,
|
|
71
|
+
api_key=Config.LANCEDB_API_KEY,
|
|
72
|
+
region=Config.LANCEDB_REGION,
|
|
73
|
+
)
|
|
74
|
+
else:
|
|
75
|
+
# Local file system connection
|
|
76
|
+
return lancedb.connect(db_path)
|
|
77
|
+
|
|
78
|
+
def _has_cloud_config(self) -> bool:
|
|
79
|
+
"""Check if cloud configuration is complete."""
|
|
80
|
+
return bool(
|
|
81
|
+
Config.LANCEDB_URI and Config.LANCEDB_API_KEY and Config.LANCEDB_REGION
|
|
82
|
+
)
|
|
27
83
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
84
|
+
def _validate_configuration(self) -> None:
|
|
85
|
+
"""Validate that the configuration is compatible with the database."""
|
|
86
|
+
from haiku.rag.store.repositories.settings import SettingsRepository
|
|
87
|
+
|
|
88
|
+
settings_repo = SettingsRepository(self)
|
|
89
|
+
settings_repo.validate_config_compatibility()
|
|
32
90
|
|
|
33
91
|
def create_or_update_db(self):
|
|
34
|
-
"""Create the database
|
|
35
|
-
current_version = metadata.version("haiku.rag")
|
|
92
|
+
"""Create the database tables."""
|
|
36
93
|
|
|
37
|
-
|
|
38
|
-
db.
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
existing_tables = [
|
|
47
|
-
row[0]
|
|
48
|
-
for row in db.execute(
|
|
49
|
-
"SELECT name FROM sqlite_master WHERE type='table';"
|
|
50
|
-
).fetchall()
|
|
51
|
-
]
|
|
52
|
-
|
|
53
|
-
# If we have a db already, perform upgrades and return
|
|
54
|
-
if self.db_path != ":memory:" and "documents" in existing_tables:
|
|
55
|
-
# Upgrade database
|
|
56
|
-
console = Console()
|
|
57
|
-
db_version = self.get_user_version()
|
|
58
|
-
for version, steps in upgrades:
|
|
59
|
-
if parse(current_version) >= parse(version) and parse(version) > parse(
|
|
60
|
-
db_version
|
|
61
|
-
):
|
|
62
|
-
for step in steps:
|
|
63
|
-
step(db)
|
|
64
|
-
console.print(
|
|
65
|
-
f"[green][b]DB Upgrade: [/b]{step.__doc__}[/green]"
|
|
66
|
-
)
|
|
67
|
-
return
|
|
68
|
-
|
|
69
|
-
# Create documents table
|
|
70
|
-
db.execute("""
|
|
71
|
-
CREATE TABLE IF NOT EXISTS documents (
|
|
72
|
-
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
73
|
-
content TEXT NOT NULL,
|
|
74
|
-
uri TEXT,
|
|
75
|
-
metadata TEXT DEFAULT '{}',
|
|
76
|
-
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
|
77
|
-
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
|
78
|
-
)
|
|
79
|
-
""")
|
|
80
|
-
# Create chunks table
|
|
81
|
-
db.execute("""
|
|
82
|
-
CREATE TABLE IF NOT EXISTS chunks (
|
|
83
|
-
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
84
|
-
document_id INTEGER NOT NULL,
|
|
85
|
-
content TEXT NOT NULL,
|
|
86
|
-
metadata TEXT DEFAULT '{}',
|
|
87
|
-
FOREIGN KEY (document_id) REFERENCES documents (id) ON DELETE CASCADE
|
|
88
|
-
)
|
|
89
|
-
""")
|
|
90
|
-
# Create vector table for chunk embeddings
|
|
91
|
-
embedder = get_embedder()
|
|
92
|
-
db.execute(f"""
|
|
93
|
-
CREATE VIRTUAL TABLE IF NOT EXISTS chunk_embeddings USING vec0(
|
|
94
|
-
chunk_id INTEGER PRIMARY KEY,
|
|
95
|
-
embedding FLOAT[{embedder._vector_dim}]
|
|
94
|
+
# Get list of existing tables
|
|
95
|
+
existing_tables = self.db.table_names()
|
|
96
|
+
|
|
97
|
+
# Create or get documents table
|
|
98
|
+
if "documents" in existing_tables:
|
|
99
|
+
self.documents_table = self.db.open_table("documents")
|
|
100
|
+
else:
|
|
101
|
+
self.documents_table = self.db.create_table(
|
|
102
|
+
"documents", schema=DocumentRecord
|
|
96
103
|
)
|
|
97
|
-
|
|
98
|
-
# Create
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
+
|
|
105
|
+
# Create or get chunks table
|
|
106
|
+
if "chunks" in existing_tables:
|
|
107
|
+
self.chunks_table = self.db.open_table("chunks")
|
|
108
|
+
else:
|
|
109
|
+
self.chunks_table = self.db.create_table("chunks", schema=self.ChunkRecord)
|
|
110
|
+
# Create FTS index on the new table
|
|
111
|
+
self.chunks_table.create_fts_index("content", replace=True)
|
|
112
|
+
|
|
113
|
+
# Create or get settings table
|
|
114
|
+
if "settings" in existing_tables:
|
|
115
|
+
self.settings_table = self.db.open_table("settings")
|
|
116
|
+
else:
|
|
117
|
+
self.settings_table = self.db.create_table(
|
|
118
|
+
"settings", schema=SettingsRecord
|
|
104
119
|
)
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
id INTEGER PRIMARY KEY DEFAULT 1,
|
|
110
|
-
settings TEXT NOT NULL DEFAULT '{}'
|
|
120
|
+
# Save current settings to the new database
|
|
121
|
+
settings_data = Config.model_dump(mode="json")
|
|
122
|
+
self.settings_table.add(
|
|
123
|
+
[SettingsRecord(id="settings", settings=json.dumps(settings_data))]
|
|
111
124
|
)
|
|
112
|
-
""")
|
|
113
|
-
# Save current settings to the new database
|
|
114
|
-
settings_json = Config.model_dump_json()
|
|
115
|
-
db.execute(
|
|
116
|
-
"INSERT OR IGNORE INTO settings (id, settings) VALUES (1, ?)",
|
|
117
|
-
(settings_json,),
|
|
118
|
-
)
|
|
119
|
-
# Create indexes for better performance
|
|
120
|
-
db.execute(
|
|
121
|
-
"CREATE INDEX IF NOT EXISTS idx_chunks_document_id ON chunks(document_id)"
|
|
122
|
-
)
|
|
123
|
-
db.commit()
|
|
124
|
-
|
|
125
|
-
def get_user_version(self) -> str:
|
|
126
|
-
"""Returns the SQLite user version"""
|
|
127
|
-
if self._connection is None:
|
|
128
|
-
raise ValueError("Store connection is not available")
|
|
129
125
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
126
|
+
# Set current version in settings
|
|
127
|
+
current_version = metadata.version("haiku.rag")
|
|
128
|
+
self.set_haiku_version(current_version)
|
|
133
129
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
130
|
+
# Check if we need to perform upgrades
|
|
131
|
+
try:
|
|
132
|
+
existing_settings = list(
|
|
133
|
+
self.settings_table.search().limit(1).to_pydantic(SettingsRecord)
|
|
134
|
+
)
|
|
135
|
+
if existing_settings:
|
|
136
|
+
db_version = self.get_haiku_version() # noqa: F841
|
|
137
|
+
# TODO: Add upgrade logic here similar to SQLite version when needed
|
|
138
|
+
except Exception:
|
|
139
|
+
# Settings table might not exist yet in fresh databases
|
|
140
|
+
pass
|
|
141
|
+
|
|
142
|
+
def get_haiku_version(self) -> str:
|
|
143
|
+
"""Returns the user version stored in settings."""
|
|
144
|
+
settings_records = list(
|
|
145
|
+
self.settings_table.search().limit(1).to_pydantic(SettingsRecord)
|
|
146
|
+
)
|
|
147
|
+
if settings_records:
|
|
148
|
+
settings = (
|
|
149
|
+
json.loads(settings_records[0].settings)
|
|
150
|
+
if settings_records[0].settings
|
|
151
|
+
else {}
|
|
152
|
+
)
|
|
153
|
+
return settings.get("version", "0.0.0")
|
|
154
|
+
return "0.0.0"
|
|
138
155
|
|
|
139
|
-
|
|
140
|
-
|
|
156
|
+
def set_haiku_version(self, version: str) -> None:
|
|
157
|
+
"""Updates the user version in settings."""
|
|
158
|
+
settings_records = list(
|
|
159
|
+
self.settings_table.search().limit(1).to_pydantic(SettingsRecord)
|
|
141
160
|
)
|
|
161
|
+
if settings_records:
|
|
162
|
+
settings = (
|
|
163
|
+
json.loads(settings_records[0].settings)
|
|
164
|
+
if settings_records[0].settings
|
|
165
|
+
else {}
|
|
166
|
+
)
|
|
167
|
+
settings["version"] = version
|
|
168
|
+
# Update the record
|
|
169
|
+
self.settings_table.update(
|
|
170
|
+
where="id = 'settings'", values={"settings": json.dumps(settings)}
|
|
171
|
+
)
|
|
172
|
+
else:
|
|
173
|
+
# Create new settings record
|
|
174
|
+
settings_data = Config.model_dump(mode="json")
|
|
175
|
+
settings_data["version"] = version
|
|
176
|
+
self.settings_table.add(
|
|
177
|
+
[SettingsRecord(id="settings", settings=json.dumps(settings_data))]
|
|
178
|
+
)
|
|
142
179
|
|
|
143
180
|
def recreate_embeddings_table(self) -> None:
|
|
144
|
-
"""Recreate the
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
# Recreate with current dimensions
|
|
152
|
-
embedder = get_embedder()
|
|
153
|
-
self._connection.execute(f"""
|
|
154
|
-
CREATE VIRTUAL TABLE chunk_embeddings USING vec0(
|
|
155
|
-
chunk_id INTEGER PRIMARY KEY,
|
|
156
|
-
embedding FLOAT[{embedder._vector_dim}]
|
|
157
|
-
)
|
|
158
|
-
""")
|
|
181
|
+
"""Recreate the chunks table with current vector dimensions."""
|
|
182
|
+
# Drop and recreate chunks table
|
|
183
|
+
try:
|
|
184
|
+
self.db.drop_table("chunks")
|
|
185
|
+
except Exception:
|
|
186
|
+
pass
|
|
159
187
|
|
|
160
|
-
|
|
188
|
+
# Update the ChunkRecord model with new vector dimension
|
|
189
|
+
self.ChunkRecord = create_chunk_model(self.embedder._vector_dim)
|
|
190
|
+
self.chunks_table = self.db.create_table("chunks", schema=self.ChunkRecord)
|
|
161
191
|
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
"""Serialize a list of floats to bytes for sqlite-vec storage."""
|
|
165
|
-
return struct.pack(f"{len(embedding)}f", *embedding)
|
|
192
|
+
# Create FTS index on the new table
|
|
193
|
+
self.chunks_table.create_fts_index("content", replace=True)
|
|
166
194
|
|
|
167
195
|
def close(self):
|
|
168
|
-
"""Close the database connection
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
196
|
+
"""Close the database connection."""
|
|
197
|
+
# LanceDB connections are automatically managed
|
|
198
|
+
pass
|
|
199
|
+
|
|
200
|
+
@property
|
|
201
|
+
def _connection(self):
|
|
202
|
+
"""Compatibility property for repositories expecting _connection."""
|
|
203
|
+
return self
|
haiku/rag/store/models/chunk.py
CHANGED
|
@@ -6,8 +6,8 @@ class Chunk(BaseModel):
|
|
|
6
6
|
Represents a chunk with content, metadata, and optional document information.
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
-
id:
|
|
10
|
-
document_id:
|
|
9
|
+
id: str | None = None
|
|
10
|
+
document_id: str | None = None
|
|
11
11
|
content: str
|
|
12
12
|
metadata: dict = {}
|
|
13
13
|
document_uri: str | None = None
|
haiku/rag/store/repositories/__init__.py
CHANGED
|
@@ -1,5 +1,9 @@
|
|
|
1
|
-
from haiku.rag.store.repositories.base import BaseRepository
|
|
2
1
|
from haiku.rag.store.repositories.chunk import ChunkRepository
|
|
3
2
|
from haiku.rag.store.repositories.document import DocumentRepository
|
|
3
|
+
from haiku.rag.store.repositories.settings import SettingsRepository
|
|
4
4
|
|
|
5
|
-
__all__ = [
|
|
5
|
+
__all__ = [
|
|
6
|
+
"ChunkRepository",
|
|
7
|
+
"DocumentRepository",
|
|
8
|
+
"SettingsRepository",
|
|
9
|
+
]
|