keep-skill 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
keep/document_store.py ADDED
@@ -0,0 +1,569 @@
1
+ """
2
+ Document store using SQLite.
3
+
4
+ Stores canonical document records separate from embeddings.
5
+ This enables multiple embedding providers to index the same documents.
6
+
7
+ The document store is the source of truth for:
8
+ - Document identity (URI / custom ID)
9
+ - Summary text
10
+ - Tags (source + system)
11
+ - Timestamps
12
+
13
+ Embeddings are stored in ChromaDB collections, keyed by embedding provider.
14
+ """
15
+
16
+ import json
17
+ import sqlite3
18
+ import threading
19
+ from dataclasses import dataclass
20
+ from datetime import datetime, timezone
21
+ from pathlib import Path
22
+ from typing import Any, Optional
23
+
24
+
25
+ @dataclass
26
+ class DocumentRecord:
27
+ """
28
+ A canonical document record.
29
+
30
+ This is the source of truth, independent of any embedding index.
31
+ """
32
+ id: str
33
+ collection: str
34
+ summary: str
35
+ tags: dict[str, str]
36
+ created_at: str
37
+ updated_at: str
38
+ content_hash: Optional[str] = None
39
+
40
+
41
+ class DocumentStore:
42
+ """
43
+ SQLite-backed store for canonical document records.
44
+
45
+ Separates document metadata from embedding storage, enabling:
46
+ - Multiple embedding providers per document
47
+ - Efficient tag/metadata queries without ChromaDB
48
+ - Clear separation of concerns
49
+ """
50
+
51
+ def __init__(self, store_path: Path):
52
+ """
53
+ Args:
54
+ store_path: Path to SQLite database file
55
+ """
56
+ self._db_path = store_path
57
+ self._conn: Optional[sqlite3.Connection] = None
58
+ self._lock = threading.Lock()
59
+ self._init_db()
60
+
61
+ def _init_db(self) -> None:
62
+ """Initialize the SQLite database."""
63
+ self._db_path.parent.mkdir(parents=True, exist_ok=True)
64
+ self._conn = sqlite3.connect(str(self._db_path), check_same_thread=False)
65
+ self._conn.row_factory = sqlite3.Row
66
+
67
+ self._conn.execute("""
68
+ CREATE TABLE IF NOT EXISTS documents (
69
+ id TEXT NOT NULL,
70
+ collection TEXT NOT NULL,
71
+ summary TEXT NOT NULL,
72
+ tags_json TEXT NOT NULL DEFAULT '{}',
73
+ created_at TEXT NOT NULL,
74
+ updated_at TEXT NOT NULL,
75
+ content_hash TEXT,
76
+ PRIMARY KEY (id, collection)
77
+ )
78
+ """)
79
+
80
+ # Migration: add content_hash column if missing (for existing databases)
81
+ cursor = self._conn.execute("PRAGMA table_info(documents)")
82
+ columns = {row[1] for row in cursor.fetchall()}
83
+ if "content_hash" not in columns:
84
+ self._conn.execute("ALTER TABLE documents ADD COLUMN content_hash TEXT")
85
+
86
+ # Index for collection queries
87
+ self._conn.execute("""
88
+ CREATE INDEX IF NOT EXISTS idx_documents_collection
89
+ ON documents(collection)
90
+ """)
91
+
92
+ # Index for timestamp queries
93
+ self._conn.execute("""
94
+ CREATE INDEX IF NOT EXISTS idx_documents_updated
95
+ ON documents(updated_at)
96
+ """)
97
+
98
+ self._conn.commit()
99
+
100
+ def _now(self) -> str:
101
+ """Current timestamp in ISO format."""
102
+ return datetime.now(timezone.utc).isoformat()
103
+
104
+ def _get_unlocked(self, collection: str, id: str) -> Optional[DocumentRecord]:
105
+ """Get a document by ID without acquiring the lock (for use within locked contexts)."""
106
+ cursor = self._conn.execute("""
107
+ SELECT id, collection, summary, tags_json, created_at, updated_at, content_hash
108
+ FROM documents
109
+ WHERE id = ? AND collection = ?
110
+ """, (id, collection))
111
+
112
+ row = cursor.fetchone()
113
+ if row is None:
114
+ return None
115
+
116
+ return DocumentRecord(
117
+ id=row["id"],
118
+ collection=row["collection"],
119
+ summary=row["summary"],
120
+ tags=json.loads(row["tags_json"]),
121
+ created_at=row["created_at"],
122
+ updated_at=row["updated_at"],
123
+ content_hash=row["content_hash"],
124
+ )
125
+
126
+ # -------------------------------------------------------------------------
127
+ # Write Operations
128
+ # -------------------------------------------------------------------------
129
+
130
+ def upsert(
131
+ self,
132
+ collection: str,
133
+ id: str,
134
+ summary: str,
135
+ tags: dict[str, str],
136
+ content_hash: Optional[str] = None,
137
+ ) -> DocumentRecord:
138
+ """
139
+ Insert or update a document record.
140
+
141
+ Preserves created_at on update. Updates updated_at always.
142
+
143
+ Args:
144
+ collection: Collection name
145
+ id: Document identifier (URI or custom)
146
+ summary: Document summary text
147
+ tags: All tags (source + system)
148
+ content_hash: SHA256 hash of content (for change detection)
149
+
150
+ Returns:
151
+ The stored DocumentRecord
152
+ """
153
+ now = self._now()
154
+ tags_json = json.dumps(tags, ensure_ascii=False)
155
+
156
+ with self._lock:
157
+ # Check if exists to preserve created_at
158
+ existing = self._get_unlocked(collection, id)
159
+ created_at = existing.created_at if existing else now
160
+
161
+ self._conn.execute("""
162
+ INSERT OR REPLACE INTO documents
163
+ (id, collection, summary, tags_json, created_at, updated_at, content_hash)
164
+ VALUES (?, ?, ?, ?, ?, ?, ?)
165
+ """, (id, collection, summary, tags_json, created_at, now, content_hash))
166
+ self._conn.commit()
167
+
168
+ return DocumentRecord(
169
+ id=id,
170
+ collection=collection,
171
+ summary=summary,
172
+ tags=tags,
173
+ created_at=created_at,
174
+ updated_at=now,
175
+ content_hash=content_hash,
176
+ )
177
+
178
+ def update_summary(self, collection: str, id: str, summary: str) -> bool:
179
+ """
180
+ Update just the summary of an existing document.
181
+
182
+ Used by lazy summarization to replace placeholder summaries.
183
+
184
+ Args:
185
+ collection: Collection name
186
+ id: Document identifier
187
+ summary: New summary text
188
+
189
+ Returns:
190
+ True if document was found and updated, False otherwise
191
+ """
192
+ now = self._now()
193
+
194
+ with self._lock:
195
+ cursor = self._conn.execute("""
196
+ UPDATE documents
197
+ SET summary = ?, updated_at = ?
198
+ WHERE id = ? AND collection = ?
199
+ """, (summary, now, id, collection))
200
+ self._conn.commit()
201
+
202
+ return cursor.rowcount > 0
203
+
204
+ def update_tags(
205
+ self,
206
+ collection: str,
207
+ id: str,
208
+ tags: dict[str, str],
209
+ ) -> bool:
210
+ """
211
+ Update tags of an existing document.
212
+
213
+ Args:
214
+ collection: Collection name
215
+ id: Document identifier
216
+ tags: New tags dict (replaces existing)
217
+
218
+ Returns:
219
+ True if document was found and updated, False otherwise
220
+ """
221
+ now = self._now()
222
+ tags_json = json.dumps(tags, ensure_ascii=False)
223
+
224
+ with self._lock:
225
+ cursor = self._conn.execute("""
226
+ UPDATE documents
227
+ SET tags_json = ?, updated_at = ?
228
+ WHERE id = ? AND collection = ?
229
+ """, (tags_json, now, id, collection))
230
+ self._conn.commit()
231
+
232
+ return cursor.rowcount > 0
233
+
234
+ def delete(self, collection: str, id: str) -> bool:
235
+ """
236
+ Delete a document record.
237
+
238
+ Args:
239
+ collection: Collection name
240
+ id: Document identifier
241
+
242
+ Returns:
243
+ True if document existed and was deleted
244
+ """
245
+ with self._lock:
246
+ cursor = self._conn.execute("""
247
+ DELETE FROM documents
248
+ WHERE id = ? AND collection = ?
249
+ """, (id, collection))
250
+ self._conn.commit()
251
+
252
+ return cursor.rowcount > 0
253
+
254
+ # -------------------------------------------------------------------------
255
+ # Read Operations
256
+ # -------------------------------------------------------------------------
257
+
258
+ def get(self, collection: str, id: str) -> Optional[DocumentRecord]:
259
+ """
260
+ Get a document by ID.
261
+
262
+ Args:
263
+ collection: Collection name
264
+ id: Document identifier
265
+
266
+ Returns:
267
+ DocumentRecord if found, None otherwise
268
+ """
269
+ cursor = self._conn.execute("""
270
+ SELECT id, collection, summary, tags_json, created_at, updated_at, content_hash
271
+ FROM documents
272
+ WHERE id = ? AND collection = ?
273
+ """, (id, collection))
274
+
275
+ row = cursor.fetchone()
276
+ if row is None:
277
+ return None
278
+
279
+ return DocumentRecord(
280
+ id=row["id"],
281
+ collection=row["collection"],
282
+ summary=row["summary"],
283
+ tags=json.loads(row["tags_json"]),
284
+ created_at=row["created_at"],
285
+ updated_at=row["updated_at"],
286
+ content_hash=row["content_hash"],
287
+ )
288
+
289
+ def get_many(
290
+ self,
291
+ collection: str,
292
+ ids: list[str],
293
+ ) -> dict[str, DocumentRecord]:
294
+ """
295
+ Get multiple documents by ID.
296
+
297
+ Args:
298
+ collection: Collection name
299
+ ids: List of document identifiers
300
+
301
+ Returns:
302
+ Dict mapping id → DocumentRecord (missing IDs omitted)
303
+ """
304
+ if not ids:
305
+ return {}
306
+
307
+ placeholders = ",".join("?" * len(ids))
308
+ cursor = self._conn.execute(f"""
309
+ SELECT id, collection, summary, tags_json, created_at, updated_at, content_hash
310
+ FROM documents
311
+ WHERE collection = ? AND id IN ({placeholders})
312
+ """, (collection, *ids))
313
+
314
+ results = {}
315
+ for row in cursor:
316
+ results[row["id"]] = DocumentRecord(
317
+ id=row["id"],
318
+ collection=row["collection"],
319
+ summary=row["summary"],
320
+ tags=json.loads(row["tags_json"]),
321
+ created_at=row["created_at"],
322
+ updated_at=row["updated_at"],
323
+ content_hash=row["content_hash"],
324
+ )
325
+
326
+ return results
327
+
328
+ def exists(self, collection: str, id: str) -> bool:
329
+ """Check if a document exists."""
330
+ cursor = self._conn.execute("""
331
+ SELECT 1 FROM documents
332
+ WHERE id = ? AND collection = ?
333
+ """, (id, collection))
334
+ return cursor.fetchone() is not None
335
+
336
+ def list_ids(
337
+ self,
338
+ collection: str,
339
+ limit: Optional[int] = None,
340
+ ) -> list[str]:
341
+ """
342
+ List document IDs in a collection.
343
+
344
+ Args:
345
+ collection: Collection name
346
+ limit: Maximum number to return (None for all)
347
+
348
+ Returns:
349
+ List of document IDs
350
+ """
351
+ if limit:
352
+ cursor = self._conn.execute("""
353
+ SELECT id FROM documents
354
+ WHERE collection = ?
355
+ ORDER BY updated_at DESC
356
+ LIMIT ?
357
+ """, (collection, limit))
358
+ else:
359
+ cursor = self._conn.execute("""
360
+ SELECT id FROM documents
361
+ WHERE collection = ?
362
+ ORDER BY updated_at DESC
363
+ """, (collection,))
364
+
365
+ return [row["id"] for row in cursor]
366
+
367
+ def count(self, collection: str) -> int:
368
+ """Count documents in a collection."""
369
+ cursor = self._conn.execute("""
370
+ SELECT COUNT(*) FROM documents
371
+ WHERE collection = ?
372
+ """, (collection,))
373
+ return cursor.fetchone()[0]
374
+
375
+ def count_all(self) -> int:
376
+ """Count total documents across all collections."""
377
+ cursor = self._conn.execute("SELECT COUNT(*) FROM documents")
378
+ return cursor.fetchone()[0]
379
+
380
+ def query_by_id_prefix(
381
+ self,
382
+ collection: str,
383
+ prefix: str,
384
+ ) -> list[DocumentRecord]:
385
+ """
386
+ Query documents by ID prefix.
387
+
388
+ Args:
389
+ collection: Collection name
390
+ prefix: ID prefix to match (e.g., "_system:")
391
+
392
+ Returns:
393
+ List of matching DocumentRecords
394
+ """
395
+ cursor = self._conn.execute("""
396
+ SELECT id, collection, summary, tags_json, created_at, updated_at, content_hash
397
+ FROM documents
398
+ WHERE collection = ? AND id LIKE ?
399
+ ORDER BY id
400
+ """, (collection, f"{prefix}%"))
401
+
402
+ results = []
403
+ for row in cursor:
404
+ results.append(DocumentRecord(
405
+ id=row["id"],
406
+ collection=row["collection"],
407
+ summary=row["summary"],
408
+ tags=json.loads(row["tags_json"]),
409
+ created_at=row["created_at"],
410
+ updated_at=row["updated_at"],
411
+ content_hash=row["content_hash"],
412
+ ))
413
+
414
+ return results
415
+
416
+ # -------------------------------------------------------------------------
417
+ # Tag Queries
418
+ # -------------------------------------------------------------------------
419
+
420
+ def list_distinct_tag_keys(self, collection: str) -> list[str]:
421
+ """
422
+ List all distinct tag keys used in the collection.
423
+
424
+ Excludes system tags (prefixed with _).
425
+
426
+ Returns:
427
+ Sorted list of distinct tag keys
428
+ """
429
+ cursor = self._conn.execute("""
430
+ SELECT tags_json FROM documents
431
+ WHERE collection = ?
432
+ """, (collection,))
433
+
434
+ keys: set[str] = set()
435
+ for row in cursor:
436
+ tags = json.loads(row["tags_json"])
437
+ for key in tags:
438
+ if not key.startswith("_"):
439
+ keys.add(key)
440
+
441
+ return sorted(keys)
442
+
443
+ def list_distinct_tag_values(self, collection: str, key: str) -> list[str]:
444
+ """
445
+ List all distinct values for a given tag key.
446
+
447
+ Args:
448
+ collection: Collection name
449
+ key: Tag key to get values for
450
+
451
+ Returns:
452
+ Sorted list of distinct values
453
+ """
454
+ cursor = self._conn.execute("""
455
+ SELECT tags_json FROM documents
456
+ WHERE collection = ?
457
+ """, (collection,))
458
+
459
+ values: set[str] = set()
460
+ for row in cursor:
461
+ tags = json.loads(row["tags_json"])
462
+ if key in tags:
463
+ values.add(tags[key])
464
+
465
+ return sorted(values)
466
+
467
+ def query_by_tag_key(
468
+ self,
469
+ collection: str,
470
+ key: str,
471
+ limit: int = 100,
472
+ since_date: Optional[str] = None,
473
+ ) -> list[DocumentRecord]:
474
+ """
475
+ Find documents that have a specific tag key (any value).
476
+
477
+ Args:
478
+ collection: Collection name
479
+ key: Tag key to search for
480
+ limit: Maximum results
481
+ since_date: Only include items updated on or after this date (YYYY-MM-DD)
482
+
483
+ Returns:
484
+ List of matching DocumentRecords
485
+ """
486
+ # SQLite JSON functions for tag key existence
487
+ # json_extract returns NULL if key doesn't exist
488
+ params: list[Any] = [collection, f"$.{key}"]
489
+
490
+ sql = """
491
+ SELECT id, collection, summary, tags_json, created_at, updated_at, content_hash
492
+ FROM documents
493
+ WHERE collection = ?
494
+ AND json_extract(tags_json, ?) IS NOT NULL
495
+ """
496
+
497
+ if since_date is not None:
498
+ # Compare against the date portion of updated_at
499
+ sql += " AND updated_at >= ?\n"
500
+ params.append(since_date)
501
+
502
+ sql += "ORDER BY updated_at DESC\nLIMIT ?"
503
+ params.append(limit)
504
+
505
+ cursor = self._conn.execute(sql, params)
506
+
507
+ results = []
508
+ for row in cursor:
509
+ results.append(DocumentRecord(
510
+ id=row["id"],
511
+ collection=row["collection"],
512
+ summary=row["summary"],
513
+ tags=json.loads(row["tags_json"]),
514
+ created_at=row["created_at"],
515
+ updated_at=row["updated_at"],
516
+ content_hash=row["content_hash"],
517
+ ))
518
+
519
+ return results
520
+
521
+ # -------------------------------------------------------------------------
522
+ # Collection Management
523
+ # -------------------------------------------------------------------------
524
+
525
+ def list_collections(self) -> list[str]:
526
+ """List all collection names."""
527
+ cursor = self._conn.execute("""
528
+ SELECT DISTINCT collection FROM documents
529
+ ORDER BY collection
530
+ """)
531
+ return [row["collection"] for row in cursor]
532
+
533
+ def delete_collection(self, collection: str) -> int:
534
+ """
535
+ Delete all documents in a collection.
536
+
537
+ Args:
538
+ collection: Collection name
539
+
540
+ Returns:
541
+ Number of documents deleted
542
+ """
543
+ with self._lock:
544
+ cursor = self._conn.execute("""
545
+ DELETE FROM documents
546
+ WHERE collection = ?
547
+ """, (collection,))
548
+ self._conn.commit()
549
+ return cursor.rowcount
550
+
551
+ # -------------------------------------------------------------------------
552
+ # Lifecycle
553
+ # -------------------------------------------------------------------------
554
+
555
+ def close(self) -> None:
556
+ """Close the database connection."""
557
+ if self._conn is not None:
558
+ self._conn.close()
559
+ self._conn = None
560
+
561
+ def __enter__(self):
562
+ return self
563
+
564
+ def __exit__(self, exc_type, exc_val, exc_tb):
565
+ self.close()
566
+ return False
567
+
568
+ def __del__(self):
569
+ self.close()
keep/errors.py ADDED
@@ -0,0 +1,33 @@
1
+ """
2
+ Error logging utilities for keep CLI.
3
+
4
+ Logs full stack traces to /tmp for debugging while showing clean messages to users.
5
+ """
6
+
7
+ import traceback
8
+ from datetime import datetime, timezone
9
+ from pathlib import Path
10
+
11
# NOTE(review): a fixed, predictable path under /tmp is writable by all local
# users (symlink/pre-creation risk on shared hosts); consider tempfile or a
# per-user directory.
ERROR_LOG_PATH = Path("/tmp/keep-errors.log")


def log_exception(exc: Exception, context: str = "") -> Path:
    """
    Log exception with full traceback to file.

    Formats the traceback from the passed exception object rather than the
    ambient sys.exc_info() state: traceback.format_exc() only sees the
    currently-handled exception, so the previous implementation logged
    "NoneType: None" (and ignored ``exc``) whenever this was called outside
    an active ``except`` block.

    Args:
        exc: The exception that occurred
        context: Optional context string (e.g., command name)

    Returns:
        Path to the error log file
    """
    timestamp = datetime.now(timezone.utc).isoformat()
    # exc.__traceback__ may be None (e.g. for a never-raised exception);
    # format_exception handles that and still prints the type and message.
    trace = "".join(traceback.format_exception(type(exc), exc, exc.__traceback__))
    with open(ERROR_LOG_PATH, "a") as f:
        f.write(f"\n{'='*60}\n")
        f.write(f"[{timestamp}]")
        if context:
            f.write(f" {context}")
        f.write("\n")
        f.write(trace)
    return ERROR_LOG_PATH
keep/indexing.py CHANGED
@@ -71,7 +71,7 @@ class IndexingConfig:
71
71
  """Approximation for token estimation."""
72
72
 
73
73
  # Summarization settings (always used)
74
- summary_max_chars: int = 500
74
+ summary_max_chars: int = 1000
75
75
  """Maximum summary length in characters."""
76
76
 
77
77
  # BM25 settings
keep/logging_config.py CHANGED
@@ -57,17 +57,48 @@ def configure_quiet_mode(quiet: bool = True):
57
57
  def enable_verbose_mode():
58
58
  """Re-enable verbose output for debugging."""
59
59
  configure_quiet_mode(quiet=False)
60
-
60
+
61
61
  # Restore defaults
62
62
  os.environ.pop("HF_HUB_DISABLE_PROGRESS_BARS", None)
63
63
  os.environ.pop("TRANSFORMERS_VERBOSITY", None)
64
-
64
+
65
65
  # Re-enable warnings
66
66
  warnings.filterwarnings("default")
67
-
67
+
68
68
  # Reset logging levels
69
69
  import logging
70
70
  logging.getLogger("transformers").setLevel(logging.INFO)
71
71
  logging.getLogger("sentence_transformers").setLevel(logging.INFO)
72
72
  logging.getLogger("mlx").setLevel(logging.INFO)
73
73
  logging.getLogger("chromadb").setLevel(logging.INFO)
74
+
75
+
76
+ def enable_debug_mode():
77
+ """Enable debug-level logging to stderr."""
78
+ import logging
79
+
80
+ # Re-enable warnings
81
+ warnings.filterwarnings("default")
82
+
83
+ # Restore library verbosity
84
+ os.environ.pop("HF_HUB_DISABLE_PROGRESS_BARS", None)
85
+ os.environ.pop("TRANSFORMERS_VERBOSITY", None)
86
+
87
+ # Configure root logger for debug output
88
+ root_logger = logging.getLogger()
89
+ root_logger.setLevel(logging.DEBUG)
90
+
91
+ # Add stderr handler if not already present
92
+ if not any(isinstance(h, logging.StreamHandler) and h.stream == sys.stderr
93
+ for h in root_logger.handlers):
94
+ handler = logging.StreamHandler(sys.stderr)
95
+ handler.setLevel(logging.DEBUG)
96
+ handler.setFormatter(logging.Formatter(
97
+ "%(asctime)s %(levelname)s %(name)s: %(message)s",
98
+ datefmt="%H:%M:%S"
99
+ ))
100
+ root_logger.addHandler(handler)
101
+
102
+ # Set library loggers to DEBUG
103
+ for name in ("keep", "transformers", "sentence_transformers", "mlx", "chromadb"):
104
+ logging.getLogger(name).setLevel(logging.DEBUG)