PyPI - haiku.rag - Versions diffs - 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl - Mend

haiku.rag 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of haiku.rag might be problematic. Click here for more details.

Files changed (26)

haiku/rag/app.py +4 -4
haiku/rag/cli.py +38 -27
haiku/rag/client.py +19 -23
haiku/rag/config.py +6 -2
haiku/rag/logging.py +4 -0
haiku/rag/mcp.py +12 -9
haiku/rag/migration.py +316 -0
haiku/rag/reranking/__init__.py +0 -6
haiku/rag/store/engine.py +173 -141
haiku/rag/store/models/chunk.py +2 -2
haiku/rag/store/models/document.py +1 -1
haiku/rag/store/repositories/__init__.py +6 -2
haiku/rag/store/repositories/chunk.py +279 -414
haiku/rag/store/repositories/document.py +171 -205
haiku/rag/store/repositories/settings.py +115 -49
haiku/rag/store/upgrades/__init__.py +1 -3
haiku/rag/utils.py +39 -31
{haiku_rag-0.6.0.dist-info → haiku_rag-0.7.0.dist-info}/METADATA +21 -16
haiku_rag-0.7.0.dist-info/RECORD +39 -0
haiku/rag/reranking/ollama.py +0 -81
haiku/rag/store/repositories/base.py +0 -40
haiku/rag/store/upgrades/v0_3_4.py +0 -26
haiku_rag-0.6.0.dist-info/RECORD +0 -41
{haiku_rag-0.6.0.dist-info → haiku_rag-0.7.0.dist-info}/WHEEL +0 -0
{haiku_rag-0.6.0.dist-info → haiku_rag-0.7.0.dist-info}/entry_points.txt +0 -0
{haiku_rag-0.6.0.dist-info → haiku_rag-0.7.0.dist-info}/licenses/LICENSE +0 -0

haiku/rag/migration.py ADDED

@@ -0,0 +1,316 @@
+#!/usr/bin/env python3
+"""
+Migration script to migrate from SQLite to LanceDB.
+This script will:
+1. Read data from an existing SQLite database
+2. Create a new LanceDB database with the same data
+3. Preserve all documents, chunks, embeddings, and settings
+"""
+import json
+import sqlite3
+import struct
+from pathlib import Path
+from uuid import uuid4
+from rich.console import Console
+from rich.progress import Progress, TaskID
+from haiku.rag.store.engine import Store
+def deserialize_sqlite_embedding(data: bytes) -> list[float]:
+    """Deserialize sqlite-vec embedding from bytes."""
+    if not data:
+        return []
+    # sqlite-vec stores embeddings as float32 arrays
+    num_floats = len(data) // 4
+    return list(struct.unpack(f"{num_floats}f", data))
+class SQLiteToLanceDBMigrator:
+    """Migrates data from SQLite to LanceDB."""
+    def __init__(self, sqlite_path: Path, lancedb_path: Path):
+        self.sqlite_path = sqlite_path
+        self.lancedb_path = lancedb_path
+        self.console = Console()
+    def migrate(self) -> bool:
+        """Perform the migration."""
+        try:
+            self.console.print(
+                f"[blue]Starting migration from {self.sqlite_path} to {self.lancedb_path}[/blue]"
+            )
+            # Check if SQLite database exists
+            if not self.sqlite_path.exists():
+                self.console.print(
+                    f"[red]SQLite database not found: {self.sqlite_path}[/red]"
+                )
+                return False
+            # Connect to SQLite database
+            sqlite_conn = sqlite3.connect(self.sqlite_path)
+            sqlite_conn.row_factory = sqlite3.Row
+            # Create LanceDB store
+            lance_store = Store(self.lancedb_path, skip_validation=True)
+            with Progress() as progress:
+                # Migrate documents
+                doc_task = progress.add_task(
+                    "[green]Migrating documents...", total=None
+                )
+                document_id_mapping = self._migrate_documents(
+                    sqlite_conn, lance_store, progress, doc_task
+                )
+                # Migrate chunks and embeddings
+                chunk_task = progress.add_task(
+                    "[yellow]Migrating chunks and embeddings...", total=None
+                )
+                self._migrate_chunks(
+                    sqlite_conn, lance_store, progress, chunk_task, document_id_mapping
+                )
+                # Migrate settings
+                settings_task = progress.add_task(
+                    "[blue]Migrating settings...", total=None
+                )
+                self._migrate_settings(
+                    sqlite_conn, lance_store, progress, settings_task
+                )
+            sqlite_conn.close()
+            # Optimize the chunks table after migration
+            self.console.print("[blue]Optimizing LanceDB...[/blue]")
+            try:
+                lance_store.chunks_table.optimize()
+                self.console.print("[green]✅ Optimization completed[/green]")
+            except Exception as e:
+                self.console.print(
+                    f"[yellow]Warning: Optimization failed: {e}[/yellow]"
+                )
+            lance_store.close()
+            self.console.print("[green]✅ Migration completed successfully![/green]")
+            self.console.print(
+                f"[green]✅ Migrated {len(document_id_mapping)} documents[/green]"
+            )
+            return True
+        except Exception as e:
+            self.console.print(f"[red]❌ Migration failed: {e}[/red]")
+            import traceback
+            self.console.print(f"[red]{traceback.format_exc()}[/red]")
+            return False
+    def _migrate_documents(
+        self,
+        sqlite_conn: sqlite3.Connection,
+        lance_store: Store,
+        progress: Progress,
+        task: TaskID,
+    ) -> dict[int, str]:
+        """Migrate documents from SQLite to LanceDB and return ID mapping."""
+        cursor = sqlite_conn.cursor()
+        cursor.execute(
+            "SELECT id, content, uri, metadata, created_at, updated_at FROM documents ORDER BY id"
+        )
+        documents = []
+        id_mapping = {}  # Maps old integer ID to new UUID
+        for row in cursor.fetchall():
+            new_uuid = str(uuid4())
+            id_mapping[row["id"]] = new_uuid
+            doc_data = {
+                "id": new_uuid,
+                "content": row["content"],
+                "uri": row["uri"],
+                "metadata": json.loads(row["metadata"]) if row["metadata"] else {},
+                "created_at": row["created_at"],
+                "updated_at": row["updated_at"],
+            }
+            documents.append(doc_data)
+        # Batch insert documents to LanceDB
+        if documents:
+            from haiku.rag.store.engine import DocumentRecord
+            doc_records = [
+                DocumentRecord(
+                    id=doc["id"],
+                    content=doc["content"],
+                    uri=doc["uri"],
+                    metadata=json.dumps(doc["metadata"]),
+                    created_at=doc["created_at"],
+                    updated_at=doc["updated_at"],
+                )
+                for doc in documents
+            ]
+            lance_store.documents_table.add(doc_records)
+        progress.update(task, completed=len(documents), total=len(documents))
+        return id_mapping
+    def _migrate_chunks(
+        self,
+        sqlite_conn: sqlite3.Connection,
+        lance_store: Store,
+        progress: Progress,
+        task: TaskID,
+        document_id_mapping: dict[int, str],
+    ):
+        """Migrate chunks and embeddings from SQLite to LanceDB."""
+        cursor = sqlite_conn.cursor()
+        # Get chunks first
+        cursor.execute("""
+            SELECT id, document_id, content, metadata
+            FROM chunks
+            ORDER BY id
+        """)
+        chunks_data = cursor.fetchall()
+        # Get embeddings separately to avoid vec0 virtual table issues
+        embeddings_map = {}
+        try:
+            # Try to get embeddings from the vec0 tables directly
+            cursor.execute("""
+                SELECT
+                    r.chunk_id,
+                    v.vectors
+                FROM chunk_embeddings_rowids r
+                JOIN chunk_embeddings_vector_chunks00 v ON r.rowid = v.rowid
+            """)
+            for row in cursor.fetchall():
+                chunk_id = row[0]
+                vectors_blob = row[1]
+                if vectors_blob and chunk_id not in embeddings_map:
+                    embeddings_map[chunk_id] = vectors_blob
+        except sqlite3.OperationalError as e:
+            self.console.print(
+                f"[yellow]Warning: Could not extract embeddings: {e}[/yellow]"
+            )
+            self.console.print(
+                "[yellow]Continuing migration without embeddings...[/yellow]"
+            )
+        chunks = []
+        for row in chunks_data:
+            # Generate new UUID for chunk
+            chunk_uuid = str(uuid4())
+            # Map the old document_id to new UUID
+            document_uuid = document_id_mapping.get(row["document_id"])
+            if not document_uuid:
+                self.console.print(
+                    f"[yellow]Warning: Document ID {row['document_id']} not found in mapping for chunk {row['id']}[/yellow]"
+                )
+                continue
+            # Get embedding for this chunk
+            embedding = []
+            embedding_blob = embeddings_map.get(row["id"])
+            if embedding_blob:
+                try:
+                    embedding = deserialize_sqlite_embedding(embedding_blob)
+                except Exception as e:
+                    self.console.print(
+                        f"[yellow]Warning: Failed to deserialize embedding for chunk {row['id']}: {e}[/yellow]"
+                    )
+                    # Generate a zero vector of the expected dimension
+                    embedding = [0.0] * lance_store.embedder._vector_dim
+            else:
+                # No embedding found, generate zero vector
+                embedding = [0.0] * lance_store.embedder._vector_dim
+            chunk_data = {
+                "id": chunk_uuid,
+                "document_id": document_uuid,
+                "content": row["content"],
+                "metadata": json.loads(row["metadata"]) if row["metadata"] else {},
+                "vector": embedding,
+            }
+            chunks.append(chunk_data)
+        # Batch insert chunks to LanceDB
+        if chunks:
+            chunk_records = [
+                lance_store.ChunkRecord(
+                    id=chunk["id"],
+                    document_id=chunk["document_id"],
+                    content=chunk["content"],
+                    metadata=json.dumps(chunk["metadata"]),
+                    vector=chunk["vector"],
+                )
+                for chunk in chunks
+            ]
+            lance_store.chunks_table.add(chunk_records)
+        progress.update(task, completed=len(chunks), total=len(chunks))
+    def _migrate_settings(
+        self,
+        sqlite_conn: sqlite3.Connection,
+        lance_store: Store,
+        progress: Progress,
+        task: TaskID,
+    ):
+        """Migrate settings from SQLite to LanceDB."""
+        cursor = sqlite_conn.cursor()
+        try:
+            cursor.execute("SELECT id, settings FROM settings WHERE id = 1")
+            row = cursor.fetchone()
+            if row:
+                settings_data = json.loads(row["settings"]) if row["settings"] else {}
+                # Update the existing settings in LanceDB (use string ID)
+                lance_store.settings_table.update(
+                    where="id = 'settings'",
+                    values={"settings": json.dumps(settings_data)},
+                )
+                progress.update(task, completed=1, total=1)
+            else:
+                progress.update(task, completed=0, total=0)
+        except sqlite3.OperationalError:
+            # Settings table doesn't exist in old SQLite database
+            self.console.print(
+                "[yellow]No settings table found in SQLite database[/yellow]"
+            )
+            progress.update(task, completed=0, total=0)
+async def migrate_sqlite_to_lancedb(
+    sqlite_path: Path, lancedb_path: Path | None = None
+) -> bool:
+    """
+    Migrate an existing SQLite database to LanceDB.
+    Args:
+        sqlite_path: Path to the existing SQLite database
+        lancedb_path: Path for the new LanceDB database (optional, will auto-generate if not provided)
+    Returns:
+        True if migration was successful, False otherwise
+    """
+    if lancedb_path is None:
+        # Auto-generate LanceDB path
+        lancedb_path = sqlite_path.parent / (sqlite_path.stem + ".lancedb")
+    migrator = SQLiteToLanceDBMigrator(sqlite_path, lancedb_path)
+    return migrator.migrate()

haiku/rag/reranking/__init__.py CHANGED

@@ -31,10 +31,4 @@ def get_reranker() -> RerankerBase | None:
         except ImportError:
             return None
-    if Config.RERANK_PROVIDER == "ollama":
-        from haiku.rag.reranking.ollama import OllamaReranker
-        _reranker = OllamaReranker()
-        return _reranker
     return None

haiku/rag/store/engine.py CHANGED

@@ -1,171 +1,203 @@
-import sqlite3
-import struct
+import json
+import logging
 from importlib import metadata
 from pathlib import Path
-from typing import Literal
+from uuid import uuid4
-import sqlite_vec
-from packaging.version import parse
-from rich.console import Console
+import lancedb
+from lancedb.pydantic import LanceModel, Vector
+from pydantic import Field
 from haiku.rag.config import Config
 from haiku.rag.embeddings import get_embedder
-from haiku.rag.store.upgrades import upgrades
-from haiku.rag.utils import int_to_semantic_version, semantic_version_to_int
+logger = logging.getLogger(__name__)
+class DocumentRecord(LanceModel):
+    id: str = Field(default_factory=lambda: str(uuid4()))
+    content: str
+    uri: str | None = None
+    metadata: str = Field(default="{}")
+    created_at: str = Field(default_factory=lambda: "")
+    updated_at: str = Field(default_factory=lambda: "")
+def create_chunk_model(vector_dim: int):
+    """Create a ChunkRecord model with the specified vector dimension.
+    This creates a model with proper vector typing for LanceDB.
+    """
+    class ChunkRecord(LanceModel):
+        id: str = Field(default_factory=lambda: str(uuid4()))
+        document_id: str
+        content: str
+        metadata: str = Field(default="{}")
+        vector: Vector(vector_dim) = Field(default_factory=lambda: [0.0] * vector_dim)  # type: ignore
+    return ChunkRecord
+class SettingsRecord(LanceModel):
+    id: str = Field(default="settings")
+    settings: str = Field(default="{}")
 class Store:
-    def __init__(
-        self, db_path: Path | Literal[":memory:"], skip_validation: bool = False
-    ):
-        self.db_path: Path | Literal[":memory:"] = db_path
+    def __init__(self, db_path: Path, skip_validation: bool = False):
+        self.db_path: Path = db_path
+        self.embedder = get_embedder()
+        # Create the ChunkRecord model with the correct vector dimension
+        self.ChunkRecord = create_chunk_model(self.embedder._vector_dim)
+        # Connect to LanceDB
+        self.db = self._connect_to_lancedb(db_path)
+        # Initialize tables
         self.create_or_update_db()
         # Validate config compatibility after connection is established
         if not skip_validation:
-            from haiku.rag.store.repositories.settings import SettingsRepository
+            self._validate_configuration()
+    def _connect_to_lancedb(self, db_path: Path):
+        """Establish connection to LanceDB (local, cloud, or object storage)."""
+        # Check if we have cloud configuration
+        if self._has_cloud_config():
+            return lancedb.connect(
+                uri=Config.LANCEDB_URI,
+                api_key=Config.LANCEDB_API_KEY,
+                region=Config.LANCEDB_REGION,
+            )
+        else:
+            # Local file system connection
+            return lancedb.connect(db_path)
+    def _has_cloud_config(self) -> bool:
+        """Check if cloud configuration is complete."""
+        return bool(
+            Config.LANCEDB_URI and Config.LANCEDB_API_KEY and Config.LANCEDB_REGION
+        )
-            settings_repo = SettingsRepository(self)
-            settings_repo.validate_config_compatibility()
-        current_version = metadata.version("haiku.rag")
-        self.set_user_version(current_version)
+    def _validate_configuration(self) -> None:
+        """Validate that the configuration is compatible with the database."""
+        from haiku.rag.store.repositories.settings import SettingsRepository
+        settings_repo = SettingsRepository(self)
+        settings_repo.validate_config_compatibility()
     def create_or_update_db(self):
-        """Create the database and tables with sqlite-vec support for embeddings."""
-        current_version = metadata.version("haiku.rag")
+        """Create the database tables."""
-        db = sqlite3.connect(self.db_path)
-        db.enable_load_extension(True)
-        sqlite_vec.load(db)
-        # Enable WAL mode for better concurrency (skip for in-memory databases)
-        if self.db_path != ":memory:":
-            db.execute("PRAGMA journal_mode=WAL")
-        self._connection = db
-        existing_tables = [
-            row[0]
-            for row in db.execute(
-                "SELECT name FROM sqlite_master WHERE type='table';"
-            ).fetchall()
-        ]
-        # If we have a db already, perform upgrades and return
-        if self.db_path != ":memory:" and "documents" in existing_tables:
-            # Upgrade database
-            console = Console()
-            db_version = self.get_user_version()
-            for version, steps in upgrades:
-                if parse(current_version) >= parse(version) and parse(version) > parse(
-                    db_version
-                ):
-                    for step in steps:
-                        step(db)
-                        console.print(
-                            f"[green][b]DB Upgrade: [/b]{step.__doc__}[/green]"
-                        )
-            return
-        # Create documents table
-        db.execute("""
-            CREATE TABLE IF NOT EXISTS documents (
-                id INTEGER PRIMARY KEY AUTOINCREMENT,
-                content TEXT NOT NULL,
-                uri TEXT,
-                metadata TEXT DEFAULT '{}',
-                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
-            )
-        """)
-        # Create chunks table
-        db.execute("""
-            CREATE TABLE IF NOT EXISTS chunks (
-                id INTEGER PRIMARY KEY AUTOINCREMENT,
-                document_id INTEGER NOT NULL,
-                content TEXT NOT NULL,
-                metadata TEXT DEFAULT '{}',
-                FOREIGN KEY (document_id) REFERENCES documents (id) ON DELETE CASCADE
-            )
-        """)
-        # Create vector table for chunk embeddings
-        embedder = get_embedder()
-        db.execute(f"""
-            CREATE VIRTUAL TABLE IF NOT EXISTS chunk_embeddings USING vec0(
-                chunk_id INTEGER PRIMARY KEY,
-                embedding FLOAT[{embedder._vector_dim}]
+        # Get list of existing tables
+        existing_tables = self.db.table_names()
+        # Create or get documents table
+        if "documents" in existing_tables:
+            self.documents_table = self.db.open_table("documents")
+        else:
+            self.documents_table = self.db.create_table(
+                "documents", schema=DocumentRecord
             )
-        """)
-        # Create FTS5 table for full-text search
-        db.execute("""
-            CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
-                content,
-                content='chunks',
-                content_rowid='id'
+        # Create or get chunks table
+        if "chunks" in existing_tables:
+            self.chunks_table = self.db.open_table("chunks")
+        else:
+            self.chunks_table = self.db.create_table("chunks", schema=self.ChunkRecord)
+            # Create FTS index on the new table
+            self.chunks_table.create_fts_index("content", replace=True)
+        # Create or get settings table
+        if "settings" in existing_tables:
+            self.settings_table = self.db.open_table("settings")
+        else:
+            self.settings_table = self.db.create_table(
+                "settings", schema=SettingsRecord
             )
-        """)
-        # Create settings table for storing current configuration
-        db.execute("""
-            CREATE TABLE IF NOT EXISTS settings (
-                id INTEGER PRIMARY KEY DEFAULT 1,
-                settings TEXT NOT NULL DEFAULT '{}'
+            # Save current settings to the new database
+            settings_data = Config.model_dump(mode="json")
+            self.settings_table.add(
+                [SettingsRecord(id="settings", settings=json.dumps(settings_data))]
             )
-        """)
-        # Save current settings to the new database
-        settings_json = Config.model_dump_json()
-        db.execute(
-            "INSERT OR IGNORE INTO settings (id, settings) VALUES (1, ?)",
-            (settings_json,),
-        )
-        # Create indexes for better performance
-        db.execute(
-            "CREATE INDEX IF NOT EXISTS idx_chunks_document_id ON chunks(document_id)"
-        )
-        db.commit()
-    def get_user_version(self) -> str:
-        """Returns the SQLite user version"""
-        if self._connection is None:
-            raise ValueError("Store connection is not available")
-        cursor = self._connection.execute("PRAGMA user_version;")
-        version = cursor.fetchone()
-        return int_to_semantic_version(version[0])
+        # Set current version in settings
+        current_version = metadata.version("haiku.rag")
+        self.set_haiku_version(current_version)
-    def set_user_version(self, version: str) -> None:
-        """Updates the SQLite user version"""
-        if self._connection is None:
-            raise ValueError("Store connection is not available")
+        # Check if we need to perform upgrades
+        try:
+            existing_settings = list(
+                self.settings_table.search().limit(1).to_pydantic(SettingsRecord)
+            )
+            if existing_settings:
+                db_version = self.get_haiku_version()  # noqa: F841
+                # TODO: Add upgrade logic here similar to SQLite version when needed
+        except Exception:
+            # Settings table might not exist yet in fresh databases
+            pass
+    def get_haiku_version(self) -> str:
+        """Returns the user version stored in settings."""
+        settings_records = list(
+            self.settings_table.search().limit(1).to_pydantic(SettingsRecord)
+        )
+        if settings_records:
+            settings = (
+                json.loads(settings_records[0].settings)
+                if settings_records[0].settings
+                else {}
+            )
+            return settings.get("version", "0.0.0")
+        return "0.0.0"
-        self._connection.execute(
-            f"PRAGMA user_version = {semantic_version_to_int(version)};"
+    def set_haiku_version(self, version: str) -> None:
+        """Updates the user version in settings."""
+        settings_records = list(
+            self.settings_table.search().limit(1).to_pydantic(SettingsRecord)
         )
+        if settings_records:
+            settings = (
+                json.loads(settings_records[0].settings)
+                if settings_records[0].settings
+                else {}
+            )
+            settings["version"] = version
+            # Update the record
+            self.settings_table.update(
+                where="id = 'settings'", values={"settings": json.dumps(settings)}
+            )
+        else:
+            # Create new settings record
+            settings_data = Config.model_dump(mode="json")
+            settings_data["version"] = version
+            self.settings_table.add(
+                [SettingsRecord(id="settings", settings=json.dumps(settings_data))]
+            )
     def recreate_embeddings_table(self) -> None:
-        """Recreate the embeddings table with current vector dimensions."""
-        if self._connection is None:
-            raise ValueError("Store connection is not available")
-        # Drop existing embeddings table
-        self._connection.execute("DROP TABLE IF EXISTS chunk_embeddings")
-        # Recreate with current dimensions
-        embedder = get_embedder()
-        self._connection.execute(f"""
-            CREATE VIRTUAL TABLE chunk_embeddings USING vec0(
-                chunk_id INTEGER PRIMARY KEY,
-                embedding FLOAT[{embedder._vector_dim}]
-            )
-        """)
+        """Recreate the chunks table with current vector dimensions."""
+        # Drop and recreate chunks table
+        try:
+            self.db.drop_table("chunks")
+        except Exception:
+            pass
-        self._connection.commit()
+        # Update the ChunkRecord model with new vector dimension
+        self.ChunkRecord = create_chunk_model(self.embedder._vector_dim)
+        self.chunks_table = self.db.create_table("chunks", schema=self.ChunkRecord)
-    @staticmethod
-    def serialize_embedding(embedding: list[float]) -> bytes:
-        """Serialize a list of floats to bytes for sqlite-vec storage."""
-        return struct.pack(f"{len(embedding)}f", *embedding)
+        # Create FTS index on the new table
+        self.chunks_table.create_fts_index("content", replace=True)
     def close(self):
-        """Close the database connection if it's an in-memory database."""
-        if self._connection is not None:
-            self._connection.close()
-            self._connection = None
+        """Close the database connection."""
+        # LanceDB connections are automatically managed
+        pass
+    @property
+    def _connection(self):
+        """Compatibility property for repositories expecting _connection."""
+        return self

haiku/rag/store/models/chunk.py CHANGED

@@ -6,8 +6,8 @@ class Chunk(BaseModel):
     Represents a chunk with content, metadata, and optional document information.
     """
-    id: int | None = None
-    document_id: int | None = None
+    id: str | None = None
+    document_id: str | None = None
     content: str
     metadata: dict = {}
     document_uri: str | None = None

haiku.rag 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

Potentially problematic release.

haiku.rag 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl