code-memory 1.0.3.tar.gz → 1.0.5.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {code_memory-1.0.3 → code_memory-1.0.5}/PKG-INFO +2 -1
- {code_memory-1.0.3 → code_memory-1.0.5}/db.py +143 -23
- {code_memory-1.0.3 → code_memory-1.0.5}/doc_parser.py +97 -65
- {code_memory-1.0.3 → code_memory-1.0.5}/git_search.py +4 -5
- {code_memory-1.0.3 → code_memory-1.0.5}/logging_config.py +24 -1
- {code_memory-1.0.3 → code_memory-1.0.5}/parser.py +67 -42
- code_memory-1.0.5/prompts/milestone_6.xml +756 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/pyproject.toml +2 -1
- {code_memory-1.0.3 → code_memory-1.0.5}/queries.py +99 -2
- {code_memory-1.0.3 → code_memory-1.0.5}/server.py +55 -18
- {code_memory-1.0.3 → code_memory-1.0.5}/tests/test_errors.py +2 -4
- {code_memory-1.0.3 → code_memory-1.0.5}/tests/test_logging.py +1 -3
- {code_memory-1.0.3 → code_memory-1.0.5}/tests/test_tools.py +0 -3
- {code_memory-1.0.3 → code_memory-1.0.5}/uv.lock +71 -1
- {code_memory-1.0.3 → code_memory-1.0.5}/validation.py +0 -1
- {code_memory-1.0.3 → code_memory-1.0.5}/.github/workflows/ci.yml +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/.github/workflows/publish.yml +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/.gitignore +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/.python-version +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/CHANGELOG.md +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/CONTRIBUTING.md +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/LICENSE +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/Makefile +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/README.md +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/errors.py +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/prompts/milestone_1.xml +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/prompts/milestone_2.xml +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/prompts/milestone_3.xml +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/prompts/milestone_4.xml +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/prompts/milestone_5.xml +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/tests/__init__.py +0 -0
- {code_memory-1.0.3 → code_memory-1.0.5}/tests/conftest.py +1 -1
- {code_memory-1.0.3 → code_memory-1.0.5}/tests/test_validation.py +1 -1
{code_memory-1.0.3 → code_memory-1.0.5}/PKG-INFO +2 -1

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: code-memory
-Version: 1.0.3
+Version: 1.0.5
 Summary: A deterministic, high-precision code intelligence MCP server
 Project-URL: Homepage, https://github.com/kapillamba4/code-memory
 Project-URL: Documentation, https://github.com/kapillamba4/code-memory#readme
@@ -32,6 +32,7 @@ Requires-Dist: tree-sitter-ruby>=0.23.1
 Requires-Dist: tree-sitter-rust>=0.24.0
 Requires-Dist: tree-sitter-typescript>=0.23.2
 Requires-Dist: tree-sitter>=0.25.2
+Requires-Dist: xxhash>=3.6.0
 Provides-Extra: dev
 Requires-Dist: mypy>=1.13.0; extra == 'dev'
 Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
```
{code_memory-1.0.3 → code_memory-1.0.5}/db.py +143 -23

```diff
@@ -11,16 +11,19 @@ All writes use upsert semantics so re-indexing is idempotent.
 
 from __future__ import annotations
 
-import hashlib
+import logging
 import sqlite3
-from
+from contextlib import contextmanager
 from typing import TYPE_CHECKING
 
 import sqlite_vec
+import xxhash
 
 if TYPE_CHECKING:
     pass
 
+logger = logging.getLogger(__name__)
+
 # ---------------------------------------------------------------------------
 # Embedding model (lazy-loaded singleton)
 # ---------------------------------------------------------------------------
```
```diff
@@ -42,10 +45,80 @@ def get_embedding_model():
 def embed_text(text: str) -> list[float]:
     """Generate a 384-dim dense vector embedding for *text*."""
     model = get_embedding_model()
-    vec = model.encode(text, normalize_embeddings=True)
+    vec = model.encode(text, normalize_embeddings=True, show_progress_bar=False)
     return vec.tolist()
 
 
+def embed_texts_batch(texts: list[str], batch_size: int = 32) -> list[list[float]]:
+    """Generate embeddings for multiple texts at once.
+
+    This is significantly faster than calling embed_text() in a loop
+    because sentence-transformers is optimized for batch processing.
+
+    Args:
+        texts: List of text strings to embed.
+        batch_size: Number of texts to process per batch (default 32).
+
+    Returns:
+        List of embedding vectors (same order as input texts).
+    """
+    if not texts:
+        return []
+
+    model = get_embedding_model()
+
+    # Batch encode with normalization (same as single-text version)
+    vectors = model.encode(
+        texts,
+        batch_size=batch_size,
+        normalize_embeddings=True,
+        show_progress_bar=False,
+        convert_to_numpy=True,
+    )
+
+    return [v.tolist() for v in vectors]
+
+
+def warmup_embedding_model() -> None:
+    """Pre-load and warm up the embedding model.
+
+    Call this at server startup to avoid cold-start latency on first search.
+    The warmup encodes a dummy string to initialize internal tensors.
+    """
+    model = get_embedding_model()
+    # Warmup encode to initialize lazy-loaded components
+    model.encode("warmup", normalize_embeddings=True, show_progress_bar=False)
+    logger.info("Embedding model warmed up")
+
+
+# ---------------------------------------------------------------------------
+# Transaction support
+# ---------------------------------------------------------------------------
+
+
+@contextmanager
+def transaction(db: sqlite3.Connection):
+    """Context manager for explicit transaction control.
+
+    Disables autocommit, yields control, then commits on success.
+    On exception, rolls back automatically.
+
+    Example:
+        with transaction(db):
+            for item in items:
+                upsert_symbol(db, ..., auto_commit=False)
+            # Single commit here
+    """
+    # Disable autocommit by starting a transaction
+    db.execute("BEGIN")
+    try:
+        yield db
+        db.commit()
+    except Exception:
+        db.rollback()
+        raise
+
+
 # ---------------------------------------------------------------------------
 # Database initialisation
 # ---------------------------------------------------------------------------
```
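The new batch API pays off whenever many symbols or chunks are embedded in one indexing pass. A rough usage sketch (names from the diff above; the flat import assumes db.py is importable from the working directory, and real timings depend on model and hardware):

```python
import time

from db import embed_text, embed_texts_batch, warmup_embedding_model

warmup_embedding_model()  # pay the model's cold-start cost once, at startup

# 256 short code-like strings to embed (synthetic inputs for the comparison)
texts = [f"def handler_{i}(request): ..." for i in range(256)]

t0 = time.perf_counter()
looped = [embed_text(t) for t in texts]            # one encode() call per text
t1 = time.perf_counter()
batched = embed_texts_batch(texts, batch_size=64)  # a few large encode() passes
t2 = time.perf_counter()

assert len(batched) == len(texts) and len(batched[0]) == 384
print(f"loop: {t1 - t0:.2f}s   batch: {t2 - t1:.2f}s")
```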
```diff
@@ -210,17 +283,34 @@ def get_db(db_path: str = "code_memory.db") -> sqlite3.Connection:
 
 
 def file_hash(filepath: str) -> str:
-    """Compute
-
+    """Compute fast non-cryptographic hash of a file's contents.
+
+    Uses xxHash (xxh64) which is ~10x faster than SHA-256 while still
+    providing excellent collision resistance for change detection.
+
+    Args:
+        filepath: Path to the file to hash.
+
+    Returns:
+        Hexadecimal string representation of the 64-bit hash.
+    """
+    h = xxhash.xxh64()
     with open(filepath, "rb") as f:
-
+        # Read in 64KB chunks for memory efficiency
+        for chunk in iter(lambda: f.read(65536), b""):
             h.update(chunk)
     return h.hexdigest()
 
 
-def upsert_file(db: sqlite3.Connection, path: str, last_modified: float, fhash: str) -> int:
+def upsert_file(
+    db: sqlite3.Connection,
+    path: str,
+    last_modified: float,
+    fhash: str,
+    auto_commit: bool = True,
+) -> int:
     """Insert or update a file record. Returns the file_id."""
-
+    db.execute(
         """
         INSERT INTO files (path, last_modified, file_hash)
         VALUES (?, ?, ?)
```
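The file_hash() rewrite above swaps hashlib's SHA-256 for xxh64 from the python-xxhash package. xxHash is a non-cryptographic hash, which is fine for change detection but not for security. A standalone sketch of the same chunked-read pattern (64 KB chunk size as in the diff; the sample digest in the comment is illustrative only):

```python
import xxhash

def file_hash(filepath: str) -> str:
    # Same shape as db.file_hash(): stream the file in 64 KB chunks so
    # large files never need to fit in memory.
    h = xxhash.xxh64()
    with open(filepath, "rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            h.update(chunk)
    # 16 hex chars (64-bit digest) vs 64 hex chars for SHA-256
    return h.hexdigest()

# print(file_hash("README.md"))  # e.g. 'd7f3b2a1c4e5f607' (illustrative)
```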
```diff
@@ -230,13 +320,14 @@ def upsert_file(db: sqlite3.Connection, path: str, last_modified: float, fhash:
         """,
         (path, last_modified, fhash),
     )
-    db.commit()
+    if auto_commit:
+        db.commit()
     # Fetch the id (needed because last_insert_rowid isn't reliable on update)
     row = db.execute("SELECT id FROM files WHERE path = ?", (path,)).fetchone()
     return row[0]
 
 
-def delete_file_data(db: sqlite3.Connection, file_id: int) -> None:
+def delete_file_data(db: sqlite3.Connection, file_id: int, auto_commit: bool = True) -> None:
     """Remove all symbols, embeddings, and references for a file.
 
     This is called before re-indexing to guarantee idempotency.
@@ -251,7 +342,8 @@ def delete_file_data(db: sqlite3.Connection, file_id: int) -> None:
 
     db.execute("DELETE FROM symbols WHERE file_id = ?", (file_id,))
     db.execute("DELETE FROM references_ WHERE file_id = ?", (file_id,))
-    db.commit()
+    if auto_commit:
+        db.commit()
@@ -263,6 +355,7 @@ def upsert_symbol(
     line_end: int,
     parent_symbol_id: int | None,
     source_text: str,
+    auto_commit: bool = True,
 ) -> int:
     """Insert or update a symbol record. Returns the symbol_id."""
     db.execute(
@@ -277,7 +370,8 @@ def upsert_symbol(
         """,
         (name, kind, file_id, line_start, line_end, parent_symbol_id, source_text),
     )
-    db.commit()
+    if auto_commit:
+        db.commit()
     row = db.execute(
         "SELECT id FROM symbols WHERE file_id = ? AND name = ? AND kind = ? AND line_start = ?",
         (file_id, name, kind, line_start),
@@ -286,7 +380,11 @@ def upsert_symbol(
 
 
 def upsert_reference(
-    db: sqlite3.Connection, symbol_name: str, file_id: int, line_number: int
+    db: sqlite3.Connection,
+    symbol_name: str,
+    file_id: int,
+    line_number: int,
+    auto_commit: bool = True,
 ) -> None:
     """Insert or update a cross-reference record."""
     db.execute(
@@ -297,10 +395,16 @@ def upsert_reference(
         """,
         (symbol_name, file_id, line_number),
     )
-    db.commit()
+    if auto_commit:
+        db.commit()
 
 
-def upsert_embedding(db: sqlite3.Connection, symbol_id: int, embedding: list[float]) -> None:
+def upsert_embedding(
+    db: sqlite3.Connection,
+    symbol_id: int,
+    embedding: list[float],
+    auto_commit: bool = True,
+) -> None:
     """Insert or replace a symbol's dense vector embedding."""
     import struct
 
@@ -311,7 +415,8 @@ def upsert_embedding(db: sqlite3.Connection, symbol_id: int, embedding: list[flo
         "INSERT INTO symbol_embeddings (symbol_id, embedding) VALUES (?, ?)",
         (symbol_id, blob),
     )
-    db.commit()
+    if auto_commit:
+        db.commit()
 
 
 # ---------------------------------------------------------------------------
@@ -320,7 +425,12 @@ def upsert_embedding(db: sqlite3.Connection, symbol_id: int, embedding: list[flo
 
 
 def upsert_doc_file(
-    db: sqlite3.Connection, path: str, last_modified: float, fhash: str, doc_type: str
+    db: sqlite3.Connection,
+    path: str,
+    last_modified: float,
+    fhash: str,
+    doc_type: str,
+    auto_commit: bool = True,
 ) -> int:
     """Insert or update a documentation file record. Returns doc_file_id."""
     db.execute(
@@ -334,12 +444,13 @@ def upsert_doc_file(
         """,
         (path, last_modified, fhash, doc_type),
     )
-    db.commit()
+    if auto_commit:
+        db.commit()
     row = db.execute("SELECT id FROM doc_files WHERE path = ?", (path,)).fetchone()
     return row[0]
 
 
-def delete_doc_file_data(db: sqlite3.Connection, doc_file_id: int) -> None:
+def delete_doc_file_data(db: sqlite3.Connection, doc_file_id: int, auto_commit: bool = True) -> None:
     """Remove all chunks and embeddings for a documentation file.
 
     This is called before re-indexing to guarantee idempotency.
@@ -356,7 +467,8 @@ def delete_doc_file_data(db: sqlite3.Connection, doc_file_id: int) -> None:
         db.execute(f"DELETE FROM doc_embeddings WHERE chunk_id IN ({placeholders})", chunk_ids)
 
     db.execute("DELETE FROM doc_chunks WHERE doc_file_id = ?", (doc_file_id,))
-    db.commit()
+    if auto_commit:
+        db.commit()
@@ -367,6 +479,7 @@ def upsert_doc_chunk(
     content: str,
     line_start: int,
     line_end: int,
+    auto_commit: bool = True,
 ) -> int:
     """Insert or update a documentation chunk. Returns chunk_id."""
     db.execute(
@@ -382,7 +495,8 @@ def upsert_doc_chunk(
         """,
         (doc_file_id, chunk_index, section_title, content, line_start, line_end),
     )
-    db.commit()
+    if auto_commit:
+        db.commit()
     row = db.execute(
         "SELECT id FROM doc_chunks WHERE doc_file_id = ? AND chunk_index = ?",
         (doc_file_id, chunk_index),
@@ -390,7 +504,12 @@ def upsert_doc_chunk(
     return row[0]
 
 
-def upsert_doc_embedding(db: sqlite3.Connection, chunk_id: int, embedding: list[float]) -> None:
+def upsert_doc_embedding(
+    db: sqlite3.Connection,
+    chunk_id: int,
+    embedding: list[float],
+    auto_commit: bool = True,
+) -> None:
     """Insert or replace a documentation chunk's dense vector embedding."""
     import struct
 
@@ -400,4 +519,5 @@ def upsert_doc_embedding(db: sqlite3.Connection, chunk_id: int, embedding: list[
         "INSERT INTO doc_embeddings (chunk_id, embedding) VALUES (?, ?)",
         (chunk_id, blob),
     )
-    db.commit()
+    if auto_commit:
+        db.commit()
```
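Taken together, the write helpers above keep their old commit-per-call behavior by default, while bulk indexers pass auto_commit=False and let transaction() issue a single commit. A self-contained sketch of that contract (the notes table and upsert_note helper are hypothetical stand-ins for the real upsert_* functions; the connection uses isolation_level=None so the explicit BEGIN inside transaction() is valid):

```python
import sqlite3
from contextlib import contextmanager

@contextmanager
def transaction(db: sqlite3.Connection):
    # Mirrors the transaction() added to db.py above.
    db.execute("BEGIN")
    try:
        yield db
        db.commit()
    except Exception:
        db.rollback()
        raise

def upsert_note(db, text, auto_commit=True):
    # Stand-in for the upsert_* helpers; "notes" is a hypothetical table.
    db.execute("INSERT INTO notes (text) VALUES (?)", (text,))
    if auto_commit:
        db.commit()

db = sqlite3.connect(":memory:", isolation_level=None)  # autocommit mode
db.execute("CREATE TABLE notes (text TEXT)")

upsert_note(db, "one-off write")       # default path: commit per call

with transaction(db):                  # bulk path: one commit for N writes
    for text in ("a", "b", "c"):
        upsert_note(db, text, auto_commit=False)

try:
    with transaction(db):
        upsert_note(db, "doomed", auto_commit=False)
        raise RuntimeError("simulated indexing failure")
except RuntimeError:
    pass                               # 'doomed' was rolled back

print(db.execute("SELECT COUNT(*) FROM notes").fetchone()[0])  # -> 4
```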
{code_memory-1.0.3 → code_memory-1.0.5}/doc_parser.py +97 -65

```diff
@@ -7,10 +7,8 @@ and indexes them for hybrid retrieval (BM25 + vector search).
 
 from __future__ import annotations
 
-import hashlib
 import os
 import re
-from pathlib import Path
 
 from markdown_it import MarkdownIt
 
@@ -239,7 +237,7 @@ def index_doc_file(
     overlap: int = DEFAULT_OVERLAP,
     min_chunk_size: int = DEFAULT_MIN_CHUNK_SIZE,
 ) -> dict:
-    """Index a documentation file.
+    """Index a documentation file with batch embeddings and transaction.
 
     Args:
         filepath: Path to the documentation file.
@@ -259,7 +257,7 @@ def index_doc_file(
     # Check if file has changed
     stat = os.stat(abs_path)
     last_modified = stat.st_mtime
-    fhash = db_mod.file_hash(abs_path)
+    fhash = db_mod.file_hash(abs_path)  # Now uses xxHash
 
     existing = db.execute(
         "SELECT id, file_hash FROM doc_files WHERE path = ?", (abs_path,)
@@ -285,8 +283,9 @@
     # Parse and chunk
     sections = parse_markdown_sections(abs_path)
 
-
-
+    # === BATCH PROCESSING ===
+    chunks_to_store: list[dict] = []
+    embed_inputs: list[str] = []
 
     for section in sections:
         content = section["content"]
@@ -300,22 +299,34 @@
         if len(sub_content) < min_chunk_size:
             continue
 
-
-
-
-
-            section["
-
-
-
-        )
-
-        # Generate and store embedding
-        embedding = db_mod.embed_text(f"{section['section_title'] or ''}: {sub_content}")
-        db_mod.upsert_doc_embedding(db, chunk_id, embedding)
+        chunks_to_store.append({
+            "section_title": section["section_title"],
+            "content": sub_content,
+            "line_start": section["line_start"],
+            "line_end": section["line_end"],
+        })
+        embed_input = f"{section['section_title'] or ''}: {sub_content}"
+        embed_inputs.append(embed_input)
 
-
-
+    # Batch embed all chunks
+    chunks_indexed = 0
+    if embed_inputs:
+        embeddings = db_mod.embed_texts_batch(embed_inputs, batch_size=64)
+
+        with db_mod.transaction(db):
+            for i, chunk in enumerate(chunks_to_store):
+                chunk_id = db_mod.upsert_doc_chunk(
+                    db,
+                    doc_file_id,
+                    i,  # chunk_index
+                    chunk["section_title"],
+                    chunk["content"],
+                    chunk["line_start"],
+                    chunk["line_end"],
+                    auto_commit=False,
+                )
+                db_mod.upsert_doc_embedding(db, chunk_id, embeddings[i], auto_commit=False)
+                chunks_indexed += 1
 
     return {
         "file": filepath,
```
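The reworked index_doc_file() illustrates the two-phase shape the 1.0.5 indexers share: collect chunks with no model or DB calls, embed them in one batch, then write rows and vectors inside one transaction. A distilled sketch of the pattern (embed_batch and store are hypothetical stand-ins for db_mod.embed_texts_batch and the upsert helpers; the length threshold is an arbitrary stand-in for min_chunk_size):

```python
from typing import Callable

def index_two_phase(
    sections: list[dict],
    embed_batch: Callable[[list[str]], list[list[float]]],
    store: Callable[[dict, list[float]], None],
) -> int:
    """Collect, then batch-embed, then write, as in index_doc_file()."""
    # Phase 1: pure collection -- no model calls, no DB writes.
    chunks = [s for s in sections if len(s["content"]) >= 50]
    inputs = [f"{s.get('section_title') or ''}: {s['content']}" for s in chunks]
    if not inputs:
        return 0

    # Phase 2: one batched model call amortizes per-call overhead.
    vectors = embed_batch(inputs)

    # Phase 3: in the real code every store() runs with auto_commit=False
    # inside a single transaction(db) block, so there is exactly one commit.
    for chunk, vec in zip(chunks, vectors):
        store(chunk, vec)
    return len(chunks)
```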
```diff
@@ -356,8 +367,7 @@ def index_doc_directory(dirpath: str, db) -> list[dict]:
 def extract_docstrings_from_code(db) -> list[dict]:
     """Extract docstrings from already-indexed code symbols.
 
-
-    docstrings from the source_text field.
+    Uses batch embedding generation for better performance.
 
     Args:
         db: Database connection.
@@ -377,6 +387,10 @@ def extract_docstrings_from_code(db) -> list[dict]:
         """
     ).fetchall()
 
+    # === BATCH PROCESSING ===
+    docstrings_to_store: list[dict] = []
+    embed_inputs: list[str] = []
+
     for row in rows:
         symbol_id, name, kind, file_path, line_start, line_end, source_text = row
 
@@ -398,50 +412,68 @@ def extract_docstrings_from_code(db) -> list[dict]:
         if existing:
             continue
 
-        # Create a doc_file entry for the code file if needed
-        doc_file = db.execute(
-            "SELECT id FROM doc_files WHERE path = ?", (file_path,)
-        ).fetchone()
-
-        if not doc_file:
-            # Get file stats
-            stat = os.stat(file_path) if os.path.exists(file_path) else None
-            doc_file_id = db_mod.upsert_doc_file(
-                db,
-                file_path,
-                stat.st_mtime if stat else 0,
-                db_mod.file_hash(file_path) if stat else "",
-                "docstring",
-            )
-        else:
-            doc_file_id = doc_file[0]
-
-        # Get next chunk index
-        max_idx = db.execute(
-            "SELECT COALESCE(MAX(chunk_index), -1) FROM doc_chunks WHERE doc_file_id = ?",
-            (doc_file_id,),
-        ).fetchone()[0]
-
-        chunk_id = db_mod.upsert_doc_chunk(
-            db,
-            doc_file_id,
-            max_idx + 1,
-            name,  # Use symbol name as section title
-            docstring,
-            line_start,
-            line_end,
-        )
-
-        # Generate and store embedding
-        embedding = db_mod.embed_text(f"{kind} {name}: {docstring}")
-        db_mod.upsert_doc_embedding(db, chunk_id, embedding)
-
-        results.append({
-            "symbol": name,
+        docstrings_to_store.append({
+            "name": name,
             "kind": kind,
-            "file": file_path,
-            "docstring_length": len(docstring),
+            "file_path": file_path,
+            "line_start": line_start,
+            "line_end": line_end,
+            "docstring": docstring,
         })
+        embed_inputs.append(f"{kind} {name}: {docstring}")
+
+    # Batch embed all docstrings
+    if embed_inputs:
+        embeddings = db_mod.embed_texts_batch(embed_inputs, batch_size=64)
+
+        with db_mod.transaction(db):
+            for i, doc_info in enumerate(docstrings_to_store):
+                file_path = doc_info["file_path"]
+
+                # Create a doc_file entry for the code file if needed
+                doc_file = db.execute(
+                    "SELECT id FROM doc_files WHERE path = ?", (file_path,)
+                ).fetchone()
+
+                if not doc_file:
+                    # Get file stats
+                    stat = os.stat(file_path) if os.path.exists(file_path) else None
+                    doc_file_id = db_mod.upsert_doc_file(
+                        db,
+                        file_path,
+                        stat.st_mtime if stat else 0,
+                        db_mod.file_hash(file_path) if stat else "",
+                        "docstring",
+                        auto_commit=False,
+                    )
+                else:
+                    doc_file_id = doc_file[0]
+
+                # Get next chunk index
+                max_idx = db.execute(
+                    "SELECT COALESCE(MAX(chunk_index), -1) FROM doc_chunks WHERE doc_file_id = ?",
+                    (doc_file_id,),
+                ).fetchone()[0]
+
+                chunk_id = db_mod.upsert_doc_chunk(
+                    db,
+                    doc_file_id,
+                    max_idx + 1,
+                    doc_info["name"],  # Use symbol name as section title
+                    doc_info["docstring"],
+                    doc_info["line_start"],
+                    doc_info["line_end"],
+                    auto_commit=False,
+                )
+
+                db_mod.upsert_doc_embedding(db, chunk_id, embeddings[i], auto_commit=False)
+
+                results.append({
+                    "symbol": doc_info["name"],
+                    "kind": doc_info["kind"],
+                    "file": file_path,
+                    "docstring_length": len(doc_info["docstring"]),
+                })
 
     return results
```
{code_memory-1.0.3 → code_memory-1.0.5}/git_search.py +4 -5

```diff
@@ -15,14 +15,13 @@ Design rules
 
 from __future__ import annotations
 
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 from pathlib import Path
 from typing import Any
 
 import git
 from git.exc import InvalidGitRepositoryError, NoSuchPathError
 
-
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
@@ -34,7 +33,7 @@ def _commit_to_dict(commit: git.Commit, *, include_files_changed_count: bool = F
         include_files_changed_count: If True, compute the number of files
             changed (triggers a diff — slow for bulk iteration).
     """
-    dt = datetime.fromtimestamp(commit.committed_date, tz=timezone.utc)
+    dt = datetime.fromtimestamp(commit.committed_date, tz=UTC)
     result: dict[str, Any] = {
         "hash": commit.hexsha[:7],
         "full_hash": commit.hexsha,
@@ -143,7 +142,7 @@ def get_commit_detail(
         return {"error": f"Could not resolve commit '{commit_hash}': {exc}"}
 
     try:
-        dt = datetime.fromtimestamp(commit.committed_date, tz=timezone.utc)
+        dt = datetime.fromtimestamp(commit.committed_date, tz=UTC)
 
         parent_hashes = [p.hexsha[:7] for p in commit.parents]
 
@@ -271,7 +270,7 @@ def get_blame(
         "full_hash": commit.hexsha,
         "author": str(commit.author),
         "date": datetime.fromtimestamp(
-            commit.committed_date, tz=timezone.utc
+            commit.committed_date, tz=UTC
         ).isoformat(),
         "line_content": line_text,
         "commit_message": commit.message.strip().split("\n")[0],
```
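Background on the datetime change: datetime.UTC, added in Python 3.11, is the same object as datetime.timezone.utc, so on 3.11+ the rewrite is purely cosmetic. A minimal sketch (the timestamp is arbitrary):

```python
from datetime import UTC, datetime, timezone

ts = 1700000000  # arbitrary Unix timestamp, e.g. a commit.committed_date

assert UTC is timezone.utc          # UTC is just the shorter 3.11+ spelling
dt = datetime.fromtimestamp(ts, tz=UTC)
print(dt.isoformat())               # 2023-11-14T22:13:20+00:00, tz-aware
```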
{code_memory-1.0.3 → code_memory-1.0.5}/logging_config.py +24 -1

```diff
@@ -10,6 +10,8 @@ from __future__ import annotations
 import logging
 import os
 import sys
+import time
+from contextlib import contextmanager
 from datetime import datetime
 from typing import TextIO
 
@@ -24,6 +26,27 @@ DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
 _initialized = False
 
 
+@contextmanager
+def log_timing(operation_name: str, logger: logging.Logger):
+    """Context manager to log operation timing.
+
+    Args:
+        operation_name: Name of the operation being timed.
+        logger: Logger instance to use for logging.
+
+    Example:
+        with log_timing("Indexing myfile.py", logger):
+            # ... indexing code ...
+    """
+    start = time.perf_counter()
+    logger.debug(f"{operation_name} started")
+    try:
+        yield
+    finally:
+        elapsed = time.perf_counter() - start
+        logger.info(f"{operation_name} completed in {elapsed:.2f}s")
+
+
 def setup_logging(level: str = LOG_LEVEL, stream: TextIO = sys.stderr) -> logging.Logger:
     """Configure structured logging for code-memory.
 
@@ -96,7 +119,7 @@ class ToolLogger:
         self.result_count: int | None = None
         self.error: str | None = None
 
-    def __enter__(self) -> "ToolLogger":
+    def __enter__(self) -> ToolLogger:
         self.start_time = datetime.now()
         # Sanitize params for logging (don't log sensitive data)
         safe_params = {k: v for k, v in self.params.items() if v is not None}
```