haiku.rag 0.5.5__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of haiku.rag might be problematic. Click here for more details.

haiku/rag/store/engine.py CHANGED
@@ -1,171 +1,203 @@
1
- import sqlite3
2
- import struct
1
+ import json
2
+ import logging
3
3
  from importlib import metadata
4
4
  from pathlib import Path
5
- from typing import Literal
5
+ from uuid import uuid4
6
6
 
7
- import sqlite_vec
8
- from packaging.version import parse
9
- from rich.console import Console
7
+ import lancedb
8
+ from lancedb.pydantic import LanceModel, Vector
9
+ from pydantic import Field
10
10
 
11
11
  from haiku.rag.config import Config
12
12
  from haiku.rag.embeddings import get_embedder
13
- from haiku.rag.store.upgrades import upgrades
14
- from haiku.rag.utils import int_to_semantic_version, semantic_version_to_int
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ class DocumentRecord(LanceModel):
18
+ id: str = Field(default_factory=lambda: str(uuid4()))
19
+ content: str
20
+ uri: str | None = None
21
+ metadata: str = Field(default="{}")
22
+ created_at: str = Field(default_factory=lambda: "")
23
+ updated_at: str = Field(default_factory=lambda: "")
24
+
25
+
26
+ def create_chunk_model(vector_dim: int):
27
+ """Create a ChunkRecord model with the specified vector dimension.
28
+
29
+ This creates a model with proper vector typing for LanceDB.
30
+ """
31
+
32
+ class ChunkRecord(LanceModel):
33
+ id: str = Field(default_factory=lambda: str(uuid4()))
34
+ document_id: str
35
+ content: str
36
+ metadata: str = Field(default="{}")
37
+ vector: Vector(vector_dim) = Field(default_factory=lambda: [0.0] * vector_dim) # type: ignore
38
+
39
+ return ChunkRecord
40
+
41
+
42
+ class SettingsRecord(LanceModel):
43
+ id: str = Field(default="settings")
44
+ settings: str = Field(default="{}")
15
45
 
16
46
 
17
47
  class Store:
18
- def __init__(
19
- self, db_path: Path | Literal[":memory:"], skip_validation: bool = False
20
- ):
21
- self.db_path: Path | Literal[":memory:"] = db_path
48
+ def __init__(self, db_path: Path, skip_validation: bool = False):
49
+ self.db_path: Path = db_path
50
+ self.embedder = get_embedder()
51
+
52
+ # Create the ChunkRecord model with the correct vector dimension
53
+ self.ChunkRecord = create_chunk_model(self.embedder._vector_dim)
54
+
55
+ # Connect to LanceDB
56
+ self.db = self._connect_to_lancedb(db_path)
57
+
58
+ # Initialize tables
22
59
  self.create_or_update_db()
23
60
 
24
61
  # Validate config compatibility after connection is established
25
62
  if not skip_validation:
26
- from haiku.rag.store.repositories.settings import SettingsRepository
63
+ self._validate_configuration()
64
+
65
+ def _connect_to_lancedb(self, db_path: Path):
66
+ """Establish connection to LanceDB (local, cloud, or object storage)."""
67
+ # Check if we have cloud configuration
68
+ if self._has_cloud_config():
69
+ return lancedb.connect(
70
+ uri=Config.LANCEDB_URI,
71
+ api_key=Config.LANCEDB_API_KEY,
72
+ region=Config.LANCEDB_REGION,
73
+ )
74
+ else:
75
+ # Local file system connection
76
+ return lancedb.connect(db_path)
77
+
78
+ def _has_cloud_config(self) -> bool:
79
+ """Check if cloud configuration is complete."""
80
+ return bool(
81
+ Config.LANCEDB_URI and Config.LANCEDB_API_KEY and Config.LANCEDB_REGION
82
+ )
27
83
 
28
- settings_repo = SettingsRepository(self)
29
- settings_repo.validate_config_compatibility()
30
- current_version = metadata.version("haiku.rag")
31
- self.set_user_version(current_version)
84
+ def _validate_configuration(self) -> None:
85
+ """Validate that the configuration is compatible with the database."""
86
+ from haiku.rag.store.repositories.settings import SettingsRepository
87
+
88
+ settings_repo = SettingsRepository(self)
89
+ settings_repo.validate_config_compatibility()
32
90
 
33
91
  def create_or_update_db(self):
34
- """Create the database and tables with sqlite-vec support for embeddings."""
35
- current_version = metadata.version("haiku.rag")
92
+ """Create the database tables."""
36
93
 
37
- db = sqlite3.connect(self.db_path)
38
- db.enable_load_extension(True)
39
- sqlite_vec.load(db)
40
-
41
- # Enable WAL mode for better concurrency (skip for in-memory databases)
42
- if self.db_path != ":memory:":
43
- db.execute("PRAGMA journal_mode=WAL")
44
-
45
- self._connection = db
46
- existing_tables = [
47
- row[0]
48
- for row in db.execute(
49
- "SELECT name FROM sqlite_master WHERE type='table';"
50
- ).fetchall()
51
- ]
52
-
53
- # If we have a db already, perform upgrades and return
54
- if self.db_path != ":memory:" and "documents" in existing_tables:
55
- # Upgrade database
56
- console = Console()
57
- db_version = self.get_user_version()
58
- for version, steps in upgrades:
59
- if parse(current_version) >= parse(version) and parse(version) > parse(
60
- db_version
61
- ):
62
- for step in steps:
63
- step(db)
64
- console.print(
65
- f"[green][b]DB Upgrade: [/b]{step.__doc__}[/green]"
66
- )
67
- return
68
-
69
- # Create documents table
70
- db.execute("""
71
- CREATE TABLE IF NOT EXISTS documents (
72
- id INTEGER PRIMARY KEY AUTOINCREMENT,
73
- content TEXT NOT NULL,
74
- uri TEXT,
75
- metadata TEXT DEFAULT '{}',
76
- created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
77
- updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
78
- )
79
- """)
80
- # Create chunks table
81
- db.execute("""
82
- CREATE TABLE IF NOT EXISTS chunks (
83
- id INTEGER PRIMARY KEY AUTOINCREMENT,
84
- document_id INTEGER NOT NULL,
85
- content TEXT NOT NULL,
86
- metadata TEXT DEFAULT '{}',
87
- FOREIGN KEY (document_id) REFERENCES documents (id) ON DELETE CASCADE
88
- )
89
- """)
90
- # Create vector table for chunk embeddings
91
- embedder = get_embedder()
92
- db.execute(f"""
93
- CREATE VIRTUAL TABLE IF NOT EXISTS chunk_embeddings USING vec0(
94
- chunk_id INTEGER PRIMARY KEY,
95
- embedding FLOAT[{embedder._vector_dim}]
94
+ # Get list of existing tables
95
+ existing_tables = self.db.table_names()
96
+
97
+ # Create or get documents table
98
+ if "documents" in existing_tables:
99
+ self.documents_table = self.db.open_table("documents")
100
+ else:
101
+ self.documents_table = self.db.create_table(
102
+ "documents", schema=DocumentRecord
96
103
  )
97
- """)
98
- # Create FTS5 table for full-text search
99
- db.execute("""
100
- CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
101
- content,
102
- content='chunks',
103
- content_rowid='id'
104
+
105
+ # Create or get chunks table
106
+ if "chunks" in existing_tables:
107
+ self.chunks_table = self.db.open_table("chunks")
108
+ else:
109
+ self.chunks_table = self.db.create_table("chunks", schema=self.ChunkRecord)
110
+ # Create FTS index on the new table
111
+ self.chunks_table.create_fts_index("content", replace=True)
112
+
113
+ # Create or get settings table
114
+ if "settings" in existing_tables:
115
+ self.settings_table = self.db.open_table("settings")
116
+ else:
117
+ self.settings_table = self.db.create_table(
118
+ "settings", schema=SettingsRecord
104
119
  )
105
- """)
106
- # Create settings table for storing current configuration
107
- db.execute("""
108
- CREATE TABLE IF NOT EXISTS settings (
109
- id INTEGER PRIMARY KEY DEFAULT 1,
110
- settings TEXT NOT NULL DEFAULT '{}'
120
+ # Save current settings to the new database
121
+ settings_data = Config.model_dump(mode="json")
122
+ self.settings_table.add(
123
+ [SettingsRecord(id="settings", settings=json.dumps(settings_data))]
111
124
  )
112
- """)
113
- # Save current settings to the new database
114
- settings_json = Config.model_dump_json()
115
- db.execute(
116
- "INSERT OR IGNORE INTO settings (id, settings) VALUES (1, ?)",
117
- (settings_json,),
118
- )
119
- # Create indexes for better performance
120
- db.execute(
121
- "CREATE INDEX IF NOT EXISTS idx_chunks_document_id ON chunks(document_id)"
122
- )
123
- db.commit()
124
-
125
- def get_user_version(self) -> str:
126
- """Returns the SQLite user version"""
127
- if self._connection is None:
128
- raise ValueError("Store connection is not available")
129
125
 
130
- cursor = self._connection.execute("PRAGMA user_version;")
131
- version = cursor.fetchone()
132
- return int_to_semantic_version(version[0])
126
+ # Set current version in settings
127
+ current_version = metadata.version("haiku.rag")
128
+ self.set_haiku_version(current_version)
133
129
 
134
- def set_user_version(self, version: str) -> None:
135
- """Updates the SQLite user version"""
136
- if self._connection is None:
137
- raise ValueError("Store connection is not available")
130
+ # Check if we need to perform upgrades
131
+ try:
132
+ existing_settings = list(
133
+ self.settings_table.search().limit(1).to_pydantic(SettingsRecord)
134
+ )
135
+ if existing_settings:
136
+ db_version = self.get_haiku_version() # noqa: F841
137
+ # TODO: Add upgrade logic here similar to SQLite version when needed
138
+ except Exception:
139
+ # Settings table might not exist yet in fresh databases
140
+ pass
141
+
142
+ def get_haiku_version(self) -> str:
143
+ """Returns the user version stored in settings."""
144
+ settings_records = list(
145
+ self.settings_table.search().limit(1).to_pydantic(SettingsRecord)
146
+ )
147
+ if settings_records:
148
+ settings = (
149
+ json.loads(settings_records[0].settings)
150
+ if settings_records[0].settings
151
+ else {}
152
+ )
153
+ return settings.get("version", "0.0.0")
154
+ return "0.0.0"
138
155
 
139
- self._connection.execute(
140
- f"PRAGMA user_version = {semantic_version_to_int(version)};"
156
+ def set_haiku_version(self, version: str) -> None:
157
+ """Updates the user version in settings."""
158
+ settings_records = list(
159
+ self.settings_table.search().limit(1).to_pydantic(SettingsRecord)
141
160
  )
161
+ if settings_records:
162
+ settings = (
163
+ json.loads(settings_records[0].settings)
164
+ if settings_records[0].settings
165
+ else {}
166
+ )
167
+ settings["version"] = version
168
+ # Update the record
169
+ self.settings_table.update(
170
+ where="id = 'settings'", values={"settings": json.dumps(settings)}
171
+ )
172
+ else:
173
+ # Create new settings record
174
+ settings_data = Config.model_dump(mode="json")
175
+ settings_data["version"] = version
176
+ self.settings_table.add(
177
+ [SettingsRecord(id="settings", settings=json.dumps(settings_data))]
178
+ )
142
179
 
143
180
  def recreate_embeddings_table(self) -> None:
144
- """Recreate the embeddings table with current vector dimensions."""
145
- if self._connection is None:
146
- raise ValueError("Store connection is not available")
147
-
148
- # Drop existing embeddings table
149
- self._connection.execute("DROP TABLE IF EXISTS chunk_embeddings")
150
-
151
- # Recreate with current dimensions
152
- embedder = get_embedder()
153
- self._connection.execute(f"""
154
- CREATE VIRTUAL TABLE chunk_embeddings USING vec0(
155
- chunk_id INTEGER PRIMARY KEY,
156
- embedding FLOAT[{embedder._vector_dim}]
157
- )
158
- """)
181
+ """Recreate the chunks table with current vector dimensions."""
182
+ # Drop and recreate chunks table
183
+ try:
184
+ self.db.drop_table("chunks")
185
+ except Exception:
186
+ pass
159
187
 
160
- self._connection.commit()
188
+ # Update the ChunkRecord model with new vector dimension
189
+ self.ChunkRecord = create_chunk_model(self.embedder._vector_dim)
190
+ self.chunks_table = self.db.create_table("chunks", schema=self.ChunkRecord)
161
191
 
162
- @staticmethod
163
- def serialize_embedding(embedding: list[float]) -> bytes:
164
- """Serialize a list of floats to bytes for sqlite-vec storage."""
165
- return struct.pack(f"{len(embedding)}f", *embedding)
192
+ # Create FTS index on the new table
193
+ self.chunks_table.create_fts_index("content", replace=True)
166
194
 
167
195
  def close(self):
168
- """Close the database connection if it's an in-memory database."""
169
- if self._connection is not None:
170
- self._connection.close()
171
- self._connection = None
196
+ """Close the database connection."""
197
+ # LanceDB connections are automatically managed
198
+ pass
199
+
200
+ @property
201
+ def _connection(self):
202
+ """Compatibility property for repositories expecting _connection."""
203
+ return self
@@ -6,8 +6,8 @@ class Chunk(BaseModel):
6
6
  Represents a chunk with content, metadata, and optional document information.
7
7
  """
8
8
 
9
- id: int | None = None
10
- document_id: int | None = None
9
+ id: str | None = None
10
+ document_id: str | None = None
11
11
  content: str
12
12
  metadata: dict = {}
13
13
  document_uri: str | None = None
@@ -8,7 +8,7 @@ class Document(BaseModel):
8
8
  Represents a document with an ID, content, and metadata.
9
9
  """
10
10
 
11
- id: int | None = None
11
+ id: str | None = None
12
12
  content: str
13
13
  uri: str | None = None
14
14
  metadata: dict = {}
@@ -1,5 +1,9 @@
1
- from haiku.rag.store.repositories.base import BaseRepository
2
1
  from haiku.rag.store.repositories.chunk import ChunkRepository
3
2
  from haiku.rag.store.repositories.document import DocumentRepository
3
+ from haiku.rag.store.repositories.settings import SettingsRepository
4
4
 
5
- __all__ = ["BaseRepository", "DocumentRepository", "ChunkRepository"]
5
+ __all__ = [
6
+ "ChunkRepository",
7
+ "DocumentRepository",
8
+ "SettingsRepository",
9
+ ]