PyPI - code-memory - Versions diffs - 1.0.6__tar.gz → 1.0.9__tar.gz - Mend

code-memory 1.0.6.tar.gz → 1.0.9.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33)

{code_memory-1.0.6 → code_memory-1.0.9}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: code-memory
-Version: 1.0.6
+Version: 1.0.9
 Summary: A deterministic, high-precision code intelligence MCP server
 Project-URL: Homepage, https://github.com/kapillamba4/code-memory
 Project-URL: Documentation, https://github.com/kapillamba4/code-memory#readme
@@ -19,6 +19,7 @@ Requires-Python: >=3.13
 Requires-Dist: gitpython>=3.1.46
 Requires-Dist: markdown-it-py>=4.0.0
 Requires-Dist: mcp[cli]>=1.26.0
+Requires-Dist: pathspec>=0.12.1
 Requires-Dist: sentence-transformers>=5.2.3
 Requires-Dist: sqlite-vec>=0.1.6
 Requires-Dist: tree-sitter-c>=0.24.1
@@ -47,6 +48,43 @@ A deterministic, high-precision **code intelligence layer** exposed as a [Model
 `code-memory` gives your AI coding assistant structured access to your codebase through three focused pathways — eliminating context-window bloat and vague "search everything" queries.
+## Supported Languages
+### Full AST Support (Tree-sitter)
+These languages have structural parsing with symbol extraction (functions, classes, methods, etc.):
+| Language | Extensions |
+|----------|------------|
+| Python | `.py` |
+| JavaScript | `.js`, `.jsx` |
+| TypeScript | `.ts`, `.tsx` |
+| Java | `.java` |
+| Go | `.go` |
+| Rust | `.rs` |
+| C | `.c`, `.h` |
+| C++ | `.cpp`, `.hpp`, `.cc`, `.cxx` |
+| Ruby | `.rb` |
+| Kotlin | `.kt`, `.kts` |
+### Fallback Support (Whole-file Indexing)
+These file types are indexed as complete units for BM25 and semantic search:
+| Category | Extensions |
+|----------|------------|
+| C# | `.cs` |
+| Swift | `.swift` |
+| Scala | `.scala` |
+| Lua | `.lua` |
+| Shell | `.sh`, `.bash`, `.zsh` |
+| Config | `.yaml`, `.yml`, `.toml`, `.json` |
+| Web | `.html`, `.css`, `.scss` |
+| Database | `.sql` |
+| Docs | `.md`, `.txt` |
+> **Note:** Files and directories matching patterns in your `.gitignore` are automatically skipped during indexing. This excludes build artifacts, dependencies, and other generated files.
 ## Architecture: Progressive Disclosure
 Instead of a single monolithic search, `code-memory` routes queries through **three purpose-built tools**:

{code_memory-1.0.6 → code_memory-1.0.9}/README.md RENAMED Viewed

@@ -4,6 +4,43 @@ A deterministic, high-precision **code intelligence layer** exposed as a [Model
 `code-memory` gives your AI coding assistant structured access to your codebase through three focused pathways — eliminating context-window bloat and vague "search everything" queries.
+## Supported Languages
+### Full AST Support (Tree-sitter)
+These languages have structural parsing with symbol extraction (functions, classes, methods, etc.):
+| Language | Extensions |
+|----------|------------|
+| Python | `.py` |
+| JavaScript | `.js`, `.jsx` |
+| TypeScript | `.ts`, `.tsx` |
+| Java | `.java` |
+| Go | `.go` |
+| Rust | `.rs` |
+| C | `.c`, `.h` |
+| C++ | `.cpp`, `.hpp`, `.cc`, `.cxx` |
+| Ruby | `.rb` |
+| Kotlin | `.kt`, `.kts` |
+### Fallback Support (Whole-file Indexing)
+These file types are indexed as complete units for BM25 and semantic search:
+| Category | Extensions |
+|----------|------------|
+| C# | `.cs` |
+| Swift | `.swift` |
+| Scala | `.scala` |
+| Lua | `.lua` |
+| Shell | `.sh`, `.bash`, `.zsh` |
+| Config | `.yaml`, `.yml`, `.toml`, `.json` |
+| Web | `.html`, `.css`, `.scss` |
+| Database | `.sql` |
+| Docs | `.md`, `.txt` |
+> **Note:** Files and directories matching patterns in your `.gitignore` are automatically skipped during indexing. This excludes build artifacts, dependencies, and other generated files.
 ## Architecture: Progressive Disclosure
 Instead of a single monolithic search, `code-memory` routes queries through **three purpose-built tools**:

{code_memory-1.0.6 → code_memory-1.0.9}/db.py RENAMED Viewed

@@ -29,26 +29,42 @@ logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 _model = None
-EMBEDDING_DIM = 1024  # jina-code-embeddings-0.5b (Matryoshka truncated)
+_embedding_dim = None
+# Model identifier - change this if you switch to a different embedding model
+EMBEDDING_MODEL_NAME = "jinaai/jina-code-embeddings-0.5b"
 def get_embedding_model():
     """Lazy-load and cache the sentence-transformers model."""
-    global _model
+    global _model, _embedding_dim
     if _model is None:
         from sentence_transformers import SentenceTransformer
         _model = SentenceTransformer(
-            "jinaai/jina-code-embeddings-0.5b", trust_remote_code=True
+            EMBEDDING_MODEL_NAME, trust_remote_code=True
         )
+        # Cache the embedding dimension from the model
+        _embedding_dim = _model.get_sentence_embedding_dimension()
+        logger.info(f"Loaded embedding model with dimension: {_embedding_dim}")
     return _model
+def get_embedding_dim() -> int:
+    """Get the embedding dimension from the model.
+    Loads the model if not already loaded.
+    Returns the native embedding dimension of the model.
+    """
+    if _embedding_dim is None:
+        get_embedding_model()
+    return _embedding_dim
 def embed_text(text: str, task_type: str = "nl2code") -> list[float]:
     """Generate a dense vector embedding for *text*.
     Uses jina-code-embeddings with task prefix for better code retrieval.
-    Matryoshka embedding truncated to 1024 dims for efficiency.
     Args:
         text: The text to embed.
@@ -57,7 +73,7 @@ def embed_text(text: str, task_type: str = "nl2code") -> list[float]:
     model = get_embedding_model()
     prefixed_text = f"{task_type}: {text}"
     vec = model.encode(prefixed_text, normalize_embeddings=True, show_progress_bar=False)
-    return vec.tolist()[:EMBEDDING_DIM]
+    return vec.tolist()
 def embed_texts_batch(
@@ -93,7 +109,7 @@ def embed_texts_batch(
         convert_to_numpy=True,
     )
-    return [v.tolist()[:EMBEDDING_DIM] for v in vectors]
+    return [v.tolist() for v in vectors]
 def warmup_embedding_model() -> None:
@@ -141,6 +157,12 @@ def transaction(db: sqlite3.Connection):
 # ---------------------------------------------------------------------------
 _SCHEMA_SQL = """
+-- 0. Metadata table for tracking index version and model info
+CREATE TABLE IF NOT EXISTS index_metadata (
+    key   TEXT PRIMARY KEY,
+    value TEXT NOT NULL
+);
 -- 1. Tracked source files
 CREATE TABLE IF NOT EXISTS files (
     id            INTEGER PRIMARY KEY,
@@ -256,6 +278,9 @@ def get_db(project_dir: str) -> sqlite3.Connection:
     The database is stored as {project_dir}/code_memory.db to ensure each
     project has its own isolated index.
+    If the embedding model has changed since the last index, all indexed data
+    is automatically invalidated and the index will need to be rebuilt.
     Args:
         project_dir: The project directory where code_memory.db will be stored.
@@ -274,13 +299,83 @@ def get_db(project_dir: str) -> sqlite3.Connection:
     db.executescript(_SCHEMA_SQL)
-    # sqlite-vec virtual table for code embeddings (must be created outside executescript)
+    # Get embedding dimension from the model (loads model if needed)
+    embedding_dim = get_embedding_dim()
+    # Check if the embedding model has changed
+    stored_model = db.execute(
+        "SELECT value FROM index_metadata WHERE key = 'embedding_model'"
+    ).fetchone()
+    stored_dim = db.execute(
+        "SELECT value FROM index_metadata WHERE key = 'embedding_dim'"
+    ).fetchone()
+    model_changed = (
+        stored_model is None
+        or stored_model[0] != EMBEDDING_MODEL_NAME
+        or stored_dim is None
+        or int(stored_dim[0]) != embedding_dim
+    )
+    if model_changed:
+        if stored_model is not None:
+            # Model changed - invalidate existing index
+            logger.info(
+                f"Embedding model changed from '{stored_model[0] if stored_model else 'none'}' "
+                f"to '{EMBEDDING_MODEL_NAME}'. Invalidating index..."
+            )
+            _invalidate_index(db, embedding_dim)
+        else:
+            # New database - just create the embedding tables
+            _create_embedding_tables(db, embedding_dim)
+        # Store the current model info
+        db.execute(
+            "INSERT OR REPLACE INTO index_metadata (key, value) VALUES ('embedding_model', ?)",
+            (EMBEDDING_MODEL_NAME,)
+        )
+        db.execute(
+            "INSERT OR REPLACE INTO index_metadata (key, value) VALUES ('embedding_dim', ?)",
+            (str(embedding_dim),)
+        )
+        db.commit()
+    return db
+def _invalidate_index(db: sqlite3.Connection, embedding_dim: int) -> None:
+    """Invalidate the index by clearing all data and recreating embedding tables.
+    This is called when the embedding model changes.
+    """
+    # Drop existing embedding virtual tables
+    db.execute("DROP TABLE IF EXISTS symbol_embeddings")
+    db.execute("DROP TABLE IF EXISTS doc_embeddings")
+    # Clear all indexed data (cascades will handle related data via foreign keys,
+    # but we need to be explicit since FK enforcement may vary)
+    db.execute("DELETE FROM symbol_embeddings")
+    db.execute("DELETE FROM doc_embeddings")
+    db.execute("DELETE FROM symbols")
+    db.execute("DELETE FROM files")
+    db.execute("DELETE FROM references_")
+    db.execute("DELETE FROM doc_chunks")
+    db.execute("DELETE FROM doc_files")
+    # Recreate embedding tables with new dimension
+    _create_embedding_tables(db, embedding_dim)
+    logger.info("Index invalidated and embedding tables recreated")
+def _create_embedding_tables(db: sqlite3.Connection, embedding_dim: int) -> None:
+    """Create the embedding virtual tables with the specified dimension."""
+    # sqlite-vec virtual table for code embeddings
     db.execute(
         f"""
         CREATE VIRTUAL TABLE IF NOT EXISTS symbol_embeddings
         USING vec0(
             symbol_id INTEGER PRIMARY KEY,
-            embedding float[{EMBEDDING_DIM}]
+            embedding float[{embedding_dim}]
         )
         """
     )
@@ -291,12 +386,10 @@ def get_db(project_dir: str) -> sqlite3.Connection:
         CREATE VIRTUAL TABLE IF NOT EXISTS doc_embeddings
         USING vec0(
             chunk_id INTEGER PRIMARY KEY,
-            embedding float[{EMBEDDING_DIM}]
+            embedding float[{embedding_dim}]
         )
         """
     )
-    db.commit()
-    return db
 # ---------------------------------------------------------------------------

{code_memory-1.0.6 → code_memory-1.0.9}/parser.py RENAMED Viewed

@@ -14,19 +14,56 @@ import os
 from pathlib import Path
 from typing import Any
+import pathspec
 from tree_sitter import Language, Node, Parser
 import db as db_mod
 logger = logging.getLogger(__name__)
-# ── Directories to skip ───────────────────────────────────────────────
+# ── Directories to always skip (even without .gitignore) ───────────────
 _SKIP_DIRS = frozenset({
     ".venv", "venv", "__pycache__", ".git", "node_modules",
     ".mypy_cache", ".pytest_cache", ".ruff_cache", ".tox",
     "dist", "build", "target", "bin", "obj",
 })
+def _load_gitignore_spec(root_dir: str) -> pathspec.PathSpec | None:
+    """Load .gitignore patterns from the given directory.
+    Returns a PathSpec object if .gitignore exists, None otherwise.
+    """
+    gitignore_path = os.path.join(root_dir, ".gitignore")
+    if not os.path.isfile(gitignore_path):
+        return None
+    try:
+        with open(gitignore_path, encoding="utf-8") as f:
+            lines = f.readlines()
+        return pathspec.PathSpec.from_lines("gitwildmatch", lines)
+    except (OSError, UnicodeDecodeError) as e:
+        logger.debug("Failed to read .gitignore: %s", e)
+        return None
+def _should_skip_path(
+    rel_path: str,
+    is_dir: bool,
+    gitignore_spec: pathspec.PathSpec | None,
+) -> bool:
+    """Check if a path should be skipped based on .gitignore patterns."""
+    if gitignore_spec is None:
+        return False
+    # Check both the path as-is and with trailing slash for directories
+    if gitignore_spec.match_file(rel_path):
+        return True
+    if is_dir and gitignore_spec.match_file(rel_path + "/"):
+        return True
+    return False
 # ── File extensions we consider "source code" ─────────────────────────
 _SOURCE_EXTENSIONS = frozenset({
     ".py", ".js", ".jsx", ".ts", ".tsx", ".java",
@@ -360,8 +397,8 @@ def index_file(filepath: str, db) -> dict:
 def index_directory(dirpath: str, db) -> list[dict]:
     """Recursively index all source files under *dirpath*.
-    Skips directories in ``_SKIP_DIRS`` and unchanged files.  Indexes any
-    file with a recognised source-code extension.
+    Skips directories in ``_SKIP_DIRS``, files matching ``.gitignore`` patterns,
+    and unchanged files.  Indexes any file with a recognised source-code extension.
     Args:
         dirpath: Root directory to scan.
@@ -376,12 +413,33 @@ def index_directory(dirpath: str, db) -> list[dict]:
     dirpath = os.path.abspath(dirpath)
     total_start = time.perf_counter()
+    # Load .gitignore patterns from the root directory
+    gitignore_spec = _load_gitignore_spec(dirpath)
+    if gitignore_spec:
+        logger.debug("Loaded .gitignore patterns from %s", dirpath)
     for root, dirs, files in os.walk(dirpath, topdown=True):
-        # Prune skipped directories in-place
-        dirs[:] = [d for d in dirs if d not in _SKIP_DIRS
-                   and not d.endswith(".egg-info")]
+        rel_root = os.path.relpath(root, dirpath)
+        # Prune skipped directories in-place (always-skip + gitignore)
+        def _should_keep_dir(d: str) -> bool:
+            if d in _SKIP_DIRS or d.endswith(".egg-info"):
+                return False
+            if gitignore_spec:
+                rel_path = os.path.join(rel_root, d) if rel_root != "." else d
+                if _should_skip_path(rel_path, is_dir=True, gitignore_spec=gitignore_spec):
+                    return False
+            return True
+        dirs[:] = [d for d in dirs if _should_keep_dir(d)]
         for fname in sorted(files):
+            # Skip files matching .gitignore patterns
+            if gitignore_spec:
+                rel_path = os.path.join(rel_root, fname) if rel_root != "." else fname
+                if _should_skip_path(rel_path, is_dir=False, gitignore_spec=gitignore_spec):
+                    continue
             ext = os.path.splitext(fname)[1].lower()
             # Accept files with known extensions, or files with a
             # tree-sitter grammar available

{code_memory-1.0.6 → code_memory-1.0.9}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "code-memory"
-version = "1.0.6"
+version = "1.0.9"
 description = "A deterministic, high-precision code intelligence MCP server"
 readme = "README.md"
 license = "MIT"
@@ -22,6 +22,7 @@ classifiers = [
 ]
 dependencies = [
     "gitpython>=3.1.46",
+    "pathspec>=0.12.1",
     "markdown-it-py>=4.0.0",
     "mcp[cli]>=1.26.0",
     "sentence-transformers>=5.2.3",

{code_memory-1.0.6 → code_memory-1.0.9}/uv.lock RENAMED Viewed

@@ -109,12 +109,13 @@ wheels = [
 [[package]]
 name = "code-memory"
-version = "1.0.4"
+version = "1.0.6"
 source = { editable = "." }
 dependencies = [
     { name = "gitpython" },
     { name = "markdown-it-py" },
     { name = "mcp", extra = ["cli"] },
+    { name = "pathspec" },
     { name = "sentence-transformers" },
     { name = "sqlite-vec" },
     { name = "tree-sitter" },
@@ -146,6 +147,7 @@ requires-dist = [
     { name = "markdown-it-py", specifier = ">=4.0.0" },
     { name = "mcp", extras = ["cli"], specifier = ">=1.26.0" },
     { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.13.0" },
+    { name = "pathspec", specifier = ">=0.12.1" },
     { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" },
     { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.24.0" },
     { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=5.0.0" },