PyPI - keep-skill - Versions diffs - 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

keep-skill 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

keep/__init__.py +3 -6
keep/api.py +1052 -145
keep/cli.py +705 -132
keep/config.py +172 -41
keep/context.py +1 -125
keep/document_store.py +908 -0
keep/errors.py +33 -0
keep/indexing.py +1 -1
keep/logging_config.py +34 -3
keep/paths.py +81 -17
keep/pending_summaries.py +52 -40
keep/providers/embedding_cache.py +59 -46
keep/providers/embeddings.py +43 -13
keep/providers/mlx.py +23 -21
keep/store.py +169 -25
keep_skill-0.3.0.dist-info/METADATA +218 -0
keep_skill-0.3.0.dist-info/RECORD +28 -0
keep_skill-0.1.0.dist-info/METADATA +0 -290
keep_skill-0.1.0.dist-info/RECORD +0 -26
{keep_skill-0.1.0.dist-info → keep_skill-0.3.0.dist-info}/WHEEL +0 -0
{keep_skill-0.1.0.dist-info → keep_skill-0.3.0.dist-info}/entry_points.txt +0 -0
{keep_skill-0.1.0.dist-info → keep_skill-0.3.0.dist-info}/licenses/LICENSE +0 -0

keep/errors.py ADDED Viewed

@@ -0,0 +1,33 @@
+"""
+Error logging utilities for keep CLI.
+Logs full stack traces to /tmp for debugging while showing clean messages to users.
+"""
+import traceback
+from datetime import datetime, timezone
+from pathlib import Path
+ERROR_LOG_PATH = Path("/tmp/keep-errors.log")
+def log_exception(exc: Exception, context: str = "") -> Path:
+    """
+    Log exception with full traceback to file.
+    Args:
+        exc: The exception that occurred
+        context: Optional context string (e.g., command name)
+    Returns:
+        Path to the error log file
+    """
+    timestamp = datetime.now(timezone.utc).isoformat()
+    with open(ERROR_LOG_PATH, "a") as f:
+        f.write(f"\n{'='*60}\n")
+        f.write(f"[{timestamp}]")
+        if context:
+            f.write(f" {context}")
+        f.write("\n")
+        f.write(traceback.format_exc())
+    return ERROR_LOG_PATH

keep/indexing.py CHANGED Viewed

@@ -71,7 +71,7 @@ class IndexingConfig:
     """Approximation for token estimation."""
     # Summarization settings (always used)
-    summary_max_chars: int = 500
+    summary_max_chars: int = 1000
     """Maximum summary length in characters."""
     # BM25 settings

keep/logging_config.py CHANGED Viewed

@@ -57,17 +57,48 @@ def configure_quiet_mode(quiet: bool = True):
 def enable_verbose_mode():
     """Re-enable verbose output for debugging."""
     configure_quiet_mode(quiet=False)
     # Restore defaults
     os.environ.pop("HF_HUB_DISABLE_PROGRESS_BARS", None)
     os.environ.pop("TRANSFORMERS_VERBOSITY", None)
     # Re-enable warnings
     warnings.filterwarnings("default")
     # Reset logging levels
     import logging
     logging.getLogger("transformers").setLevel(logging.INFO)
     logging.getLogger("sentence_transformers").setLevel(logging.INFO)
     logging.getLogger("mlx").setLevel(logging.INFO)
     logging.getLogger("chromadb").setLevel(logging.INFO)
+def enable_debug_mode():
+    """Enable debug-level logging to stderr."""
+    import logging
+    # Re-enable warnings
+    warnings.filterwarnings("default")
+    # Restore library verbosity
+    os.environ.pop("HF_HUB_DISABLE_PROGRESS_BARS", None)
+    os.environ.pop("TRANSFORMERS_VERBOSITY", None)
+    # Configure root logger for debug output
+    root_logger = logging.getLogger()
+    root_logger.setLevel(logging.DEBUG)
+    # Add stderr handler if not already present
+    if not any(isinstance(h, logging.StreamHandler) and h.stream == sys.stderr
+               for h in root_logger.handlers):
+        handler = logging.StreamHandler(sys.stderr)
+        handler.setLevel(logging.DEBUG)
+        handler.setFormatter(logging.Formatter(
+            "%(asctime)s %(levelname)s %(name)s: %(message)s",
+            datefmt="%H:%M:%S"
+        ))
+        root_logger.addHandler(handler)
+    # Set library loggers to DEBUG
+    for name in ("keep", "transformers", "sentence_transformers", "mlx", "chromadb"):
+        logging.getLogger(name).setLevel(logging.DEBUG)

keep/paths.py CHANGED Viewed

@@ -1,11 +1,29 @@
 """
 Utility functions for locating paths.
+Config and store discovery follows this priority:
+Config discovery:
+1. KEEP_CONFIG envvar (path to .keep/ directory)
+2. Tree-walk from cwd up to ~, find first .keep/keep.toml
+3. ~/.keep/ default
+Store resolution:
+1. --store CLI option (passed to Keeper)
+2. KEEP_STORE_PATH envvar
+3. store.path in config file
+4. Config directory itself (backwards compat)
 """
+from __future__ import annotations
 import os
 import warnings
 from pathlib import Path
-from typing import Optional
+from typing import TYPE_CHECKING, Optional
+if TYPE_CHECKING:
+    from .config import StoreConfig
 def find_git_root(start_path: Optional[Path] = None) -> Optional[Path]:
@@ -35,14 +53,67 @@ def find_git_root(start_path: Optional[Path] = None) -> Optional[Path]:
     return None
-def get_default_store_path() -> Path:
+def find_config_dir(start_path: Optional[Path] = None) -> Path:
+    """
+    Find config directory by tree-walking from start_path up to home.
+    Looks for .keep/keep.toml at each directory level, stopping at home.
+    Returns the .keep/ directory containing the config, or ~/.keep/ if none found.
+    Args:
+        start_path: Path to start searching from. Defaults to cwd.
+    Returns:
+        Path to the .keep/ config directory.
+    """
+    if start_path is None:
+        start_path = Path.cwd()
+    home = Path.home()
+    current = start_path.resolve()
+    while True:
+        candidate = current / ".keep" / "keep.toml"
+        if candidate.exists():
+            return current / ".keep"
+        # Stop at home or filesystem root
+        if current == home or current == current.parent:
+            break
+        current = current.parent
+    # Default: ~/.keep/
+    return home / ".keep"
+def get_config_dir() -> Path:
+    """
+    Get the config directory.
+    Priority:
+    1. KEEP_CONFIG environment variable
+    2. Tree-walk from cwd up to ~ (find_config_dir)
+    Returns:
+        Path to the .keep/ config directory.
+    """
+    env = os.environ.get("KEEP_CONFIG")
+    if env:
+        return Path(env).expanduser().resolve()
+    return find_config_dir()
+def get_default_store_path(config: Optional[StoreConfig] = None) -> Path:
     """
     Get the default store path.
     Priority:
     1. KEEP_STORE_PATH environment variable
-    2. .keep/ directory at git repository root
-    3. ~/.keep/ in user's home directory (if not in a repo)
+    2. store.path setting in config file
+    3. Config directory itself (backwards compat)
+    Args:
+        config: Optional loaded config to check for store.path setting.
     Returns:
         Path to the store directory (may not exist yet).
@@ -51,17 +122,10 @@ def get_default_store_path() -> Path:
     env_path = os.environ.get("KEEP_STORE_PATH")
     if env_path:
         return Path(env_path).resolve()
-    # Try to find git root
-    git_root = find_git_root()
-    if git_root:
-        return git_root / ".keep"
-    # Fall back to home directory with warning
-    home = Path.home()
-    warnings.warn(
-        f"Not in a git repository. Using {home / '.keep'} for storage. "
-        f"Set KEEP_STORE_PATH to specify a different location.",
-        stacklevel=2,
-    )
-    return home / ".keep"
+    # Check config for explicit store.path
+    if config and config.store_path:
+        return Path(config.store_path).expanduser().resolve()
+    # Default: config directory is also the store
+    return get_config_dir()

keep/pending_summaries.py CHANGED Viewed

@@ -6,6 +6,7 @@ This enables fast indexing with lazy summarization.
 """
 import sqlite3
+import threading
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from pathlib import Path
@@ -37,12 +38,19 @@ class PendingSummaryQueue:
         """
         self._queue_path = queue_path
         self._conn: Optional[sqlite3.Connection] = None
+        self._lock = threading.Lock()
         self._init_db()
     def _init_db(self) -> None:
         """Initialize the SQLite database."""
         self._queue_path.parent.mkdir(parents=True, exist_ok=True)
         self._conn = sqlite3.connect(str(self._queue_path), check_same_thread=False)
+        # Enable WAL mode for better concurrent access across processes
+        self._conn.execute("PRAGMA journal_mode=WAL")
+        # Wait up to 5 seconds for locks instead of failing immediately
+        self._conn.execute("PRAGMA busy_timeout=5000")
         self._conn.execute("""
             CREATE TABLE IF NOT EXISTS pending_summaries (
                 id TEXT NOT NULL,
@@ -66,12 +74,13 @@ class PendingSummaryQueue:
         If the same id+collection already exists, replaces it (newer content wins).
         """
         now = datetime.now(timezone.utc).isoformat()
-        self._conn.execute("""
-            INSERT OR REPLACE INTO pending_summaries
-            (id, collection, content, queued_at, attempts)
-            VALUES (?, ?, ?, ?, 0)
-        """, (id, collection, content, now))
-        self._conn.commit()
+        with self._lock:
+            self._conn.execute("""
+                INSERT OR REPLACE INTO pending_summaries
+                (id, collection, content, queued_at, attempts)
+                VALUES (?, ?, ?, ?, 0)
+            """, (id, collection, content, now))
+            self._conn.commit()
     def dequeue(self, limit: int = 10) -> list[PendingSummary]:
         """
@@ -80,42 +89,44 @@ class PendingSummaryQueue:
         Items are returned but not removed - call `complete()` after successful processing.
         Increments attempt counter on each dequeue.
         """
-        cursor = self._conn.execute("""
-            SELECT id, collection, content, queued_at, attempts
-            FROM pending_summaries
-            ORDER BY queued_at ASC
-            LIMIT ?
-        """, (limit,))
-        items = []
-        for row in cursor.fetchall():
-            items.append(PendingSummary(
-                id=row[0],
-                collection=row[1],
-                content=row[2],
-                queued_at=row[3],
-                attempts=row[4],
-            ))
-        # Increment attempt counters
-        if items:
-            ids = [(item.id, item.collection) for item in items]
-            self._conn.executemany("""
-                UPDATE pending_summaries
-                SET attempts = attempts + 1
-                WHERE id = ? AND collection = ?
-            """, ids)
-            self._conn.commit()
+        with self._lock:
+            cursor = self._conn.execute("""
+                SELECT id, collection, content, queued_at, attempts
+                FROM pending_summaries
+                ORDER BY queued_at ASC
+                LIMIT ?
+            """, (limit,))
+            items = []
+            for row in cursor.fetchall():
+                items.append(PendingSummary(
+                    id=row[0],
+                    collection=row[1],
+                    content=row[2],
+                    queued_at=row[3],
+                    attempts=row[4],
+                ))
+            # Increment attempt counters
+            if items:
+                ids = [(item.id, item.collection) for item in items]
+                self._conn.executemany("""
+                    UPDATE pending_summaries
+                    SET attempts = attempts + 1
+                    WHERE id = ? AND collection = ?
+                """, ids)
+                self._conn.commit()
         return items
     def complete(self, id: str, collection: str) -> None:
         """Remove an item from the queue after successful processing."""
-        self._conn.execute("""
-            DELETE FROM pending_summaries
-            WHERE id = ? AND collection = ?
-        """, (id, collection))
-        self._conn.commit()
+        with self._lock:
+            self._conn.execute("""
+                DELETE FROM pending_summaries
+                WHERE id = ? AND collection = ?
+            """, (id, collection))
+            self._conn.commit()
     def count(self) -> int:
         """Get count of pending items."""
@@ -143,9 +154,10 @@ class PendingSummaryQueue:
     def clear(self) -> int:
         """Clear all pending items. Returns count of items cleared."""
-        count = self.count()
-        self._conn.execute("DELETE FROM pending_summaries")
-        self._conn.commit()
+        with self._lock:
+            count = self.count()
+            self._conn.execute("DELETE FROM pending_summaries")
+            self._conn.commit()
         return count
     def close(self) -> None:

keep/providers/embedding_cache.py CHANGED Viewed

@@ -8,6 +8,7 @@ avoiding redundant embedding calls for unchanged content.
 import hashlib
 import json
 import sqlite3
+import threading
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Optional
@@ -32,12 +33,19 @@ class EmbeddingCache:
         self._cache_path = cache_path
         self._max_entries = max_entries
         self._conn: Optional[sqlite3.Connection] = None
+        self._lock = threading.RLock()
         self._init_db()
     def _init_db(self) -> None:
         """Initialize the SQLite database."""
         self._cache_path.parent.mkdir(parents=True, exist_ok=True)
         self._conn = sqlite3.connect(str(self._cache_path), check_same_thread=False)
+        # Enable WAL mode for better concurrent access across processes
+        self._conn.execute("PRAGMA journal_mode=WAL")
+        # Wait up to 5 seconds for locks instead of failing immediately
+        self._conn.execute("PRAGMA busy_timeout=5000")
         self._conn.execute("""
             CREATE TABLE IF NOT EXISTS embedding_cache (
                 content_hash TEXT PRIMARY KEY,
@@ -62,28 +70,30 @@ class EmbeddingCache:
     def get(self, model_name: str, content: str) -> Optional[list[float]]:
         """
         Get cached embedding if it exists.
         Updates last_accessed timestamp on hit.
         """
         content_hash = self._hash_key(model_name, content)
-        cursor = self._conn.execute(
-            "SELECT embedding FROM embedding_cache WHERE content_hash = ?",
-            (content_hash,)
-        )
-        row = cursor.fetchone()
-        if row is not None:
-            # Update last_accessed
-            now = datetime.now(timezone.utc).isoformat()
-            self._conn.execute(
-                "UPDATE embedding_cache SET last_accessed = ? WHERE content_hash = ?",
-                (now, content_hash)
+        with self._lock:
+            cursor = self._conn.execute(
+                "SELECT embedding FROM embedding_cache WHERE content_hash = ?",
+                (content_hash,)
             )
-            self._conn.commit()
-            # Deserialize embedding
-            return json.loads(row[0])
+            row = cursor.fetchone()
+            if row is not None:
+                # Update last_accessed
+                now = datetime.now(timezone.utc).isoformat()
+                self._conn.execute(
+                    "UPDATE embedding_cache SET last_accessed = ? WHERE content_hash = ?",
+                    (now, content_hash)
+                )
+                self._conn.commit()
+                # Deserialize embedding
+                return json.loads(row[0])
         return None
     def put(
@@ -94,40 +104,42 @@ class EmbeddingCache:
     ) -> None:
         """
         Cache an embedding.
         Evicts oldest entries if cache exceeds max_entries.
         """
         content_hash = self._hash_key(model_name, content)
         now = datetime.now(timezone.utc).isoformat()
         embedding_blob = json.dumps(embedding)
-        self._conn.execute("""
-            INSERT OR REPLACE INTO embedding_cache
-            (content_hash, model_name, embedding, dimension, created_at, last_accessed)
-            VALUES (?, ?, ?, ?, ?, ?)
-        """, (content_hash, model_name, embedding_blob, len(embedding), now, now))
-        self._conn.commit()
-        # Evict old entries if needed
-        self._maybe_evict()
+        with self._lock:
+            self._conn.execute("""
+                INSERT OR REPLACE INTO embedding_cache
+                (content_hash, model_name, embedding, dimension, created_at, last_accessed)
+                VALUES (?, ?, ?, ?, ?, ?)
+            """, (content_hash, model_name, embedding_blob, len(embedding), now, now))
+            self._conn.commit()
+            # Evict old entries if needed
+            self._maybe_evict()
     def _maybe_evict(self) -> None:
         """Evict oldest entries if cache exceeds max size."""
-        cursor = self._conn.execute("SELECT COUNT(*) FROM embedding_cache")
-        count = cursor.fetchone()[0]
-        if count > self._max_entries:
-            # Delete oldest 10% by last_accessed
-            evict_count = max(1, count // 10)
-            self._conn.execute("""
-                DELETE FROM embedding_cache
-                WHERE content_hash IN (
-                    SELECT content_hash FROM embedding_cache
-                    ORDER BY last_accessed ASC
-                    LIMIT ?
-                )
-            """, (evict_count,))
-            self._conn.commit()
+        with self._lock:
+            cursor = self._conn.execute("SELECT COUNT(*) FROM embedding_cache")
+            count = cursor.fetchone()[0]
+            if count > self._max_entries:
+                # Delete oldest 10% by last_accessed
+                evict_count = max(1, count // 10)
+                self._conn.execute("""
+                    DELETE FROM embedding_cache
+                    WHERE content_hash IN (
+                        SELECT content_hash FROM embedding_cache
+                        ORDER BY last_accessed ASC
+                        LIMIT ?
+                    )
+                """, (evict_count,))
+                self._conn.commit()
     def stats(self) -> dict:
         """Get cache statistics."""
@@ -145,8 +157,9 @@ class EmbeddingCache:
     def clear(self) -> None:
         """Clear all cached embeddings."""
-        self._conn.execute("DELETE FROM embedding_cache")
-        self._conn.commit()
+        with self._lock:
+            self._conn.execute("DELETE FROM embedding_cache")
+            self._conn.commit()
     def close(self) -> None:
         """Close the database connection."""

keep/providers/embeddings.py CHANGED Viewed

@@ -11,12 +11,12 @@ from .base import EmbeddingProvider, get_registry
 class SentenceTransformerEmbedding:
     """
     Embedding provider using sentence-transformers library.
     Runs locally, no API key required. Good default for getting started.
     Requires: pip install sentence-transformers
     """
     def __init__(self, model: str = "all-MiniLM-L6-v2"):
         """
         Args:
@@ -29,9 +29,21 @@ class SentenceTransformerEmbedding:
                 "SentenceTransformerEmbedding requires 'sentence-transformers' library. "
                 "Install with: pip install sentence-transformers"
             )
         self.model_name = model
-        self._model = SentenceTransformer(model)
+        # Check if model is already cached locally to avoid network calls
+        # Expand short model names (e.g. "all-MiniLM-L6-v2" -> "sentence-transformers/all-MiniLM-L6-v2")
+        local_only = False
+        try:
+            from huggingface_hub import try_to_load_from_cache
+            repo_id = model if "/" in model else f"sentence-transformers/{model}"
+            cached = try_to_load_from_cache(repo_id, "config.json")
+            local_only = cached is not None
+        except ImportError:
+            pass
+        self._model = SentenceTransformer(model, local_files_only=local_only)
     @property
     def dimension(self) -> int:
@@ -83,8 +95,9 @@ class OpenAIEmbedding:
             )
         self.model_name = model
-        self._dimension = self.MODEL_DIMENSIONS.get(model, 1536)
+        # Use lookup table if available, otherwise detect lazily from first embedding
+        self._dimension = self.MODEL_DIMENSIONS.get(model)
         # Resolve API key
         key = api_key or os.environ.get("KEEP_OPENAI_API_KEY") or os.environ.get("OPENAI_API_KEY")
         if not key:
@@ -96,16 +109,24 @@ class OpenAIEmbedding:
     @property
     def dimension(self) -> int:
-        """Get embedding dimension for the model."""
+        """Get embedding dimension for the model (detected lazily if unknown)."""
+        if self._dimension is None:
+            # Unknown model: detect from first embedding
+            test_embedding = self.embed("dimension test")
+            self._dimension = len(test_embedding)
         return self._dimension
     def embed(self, text: str) -> list[float]:
         """Generate embedding for a single text."""
         response = self._client.embeddings.create(
             model=self.model_name,
             input=text,
         )
-        return response.data[0].embedding
+        embedding = response.data[0].embedding
+        # Cache dimension if not yet known
+        if self._dimension is None:
+            self._dimension = len(embedding)
+        return embedding
     def embed_batch(self, texts: list[str]) -> list[list[float]]:
         """Generate embeddings for multiple texts."""
@@ -152,7 +173,8 @@ class GeminiEmbedding:
             )
         self.model_name = model
-        self._dimension = self.MODEL_DIMENSIONS.get(model, 768)
+        # Use lookup table if available, otherwise detect lazily from first embedding
+        self._dimension = self.MODEL_DIMENSIONS.get(model)
         # Resolve API key
         key = api_key or os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
@@ -165,7 +187,11 @@ class GeminiEmbedding:
     @property
     def dimension(self) -> int:
-        """Get embedding dimension for the model."""
+        """Get embedding dimension for the model (detected lazily if unknown)."""
+        if self._dimension is None:
+            # Unknown model: detect from first embedding
+            test_embedding = self.embed("dimension test")
+            self._dimension = len(test_embedding)
         return self._dimension
     def embed(self, text: str) -> list[float]:
@@ -174,7 +200,11 @@ class GeminiEmbedding:
             model=self.model_name,
             contents=text,
         )
-        return list(result.embeddings[0].values)
+        embedding = list(result.embeddings[0].values)
+        # Cache dimension if not yet known
+        if self._dimension is None:
+            self._dimension = len(embedding)
+        return embedding
     def embed_batch(self, texts: list[str]) -> list[list[float]]:
         """Generate embeddings for multiple texts."""

keep-skill 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

keep-skill 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl