code-memory 1.0.17__tar.gz → 1.0.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. {code_memory-1.0.17 → code_memory-1.0.18}/PKG-INFO +1 -1
  2. {code_memory-1.0.17 → code_memory-1.0.18}/db.py +105 -0
  3. {code_memory-1.0.17 → code_memory-1.0.18}/logging_config.py +6 -6
  4. {code_memory-1.0.17 → code_memory-1.0.18}/parser.py +235 -53
  5. {code_memory-1.0.17 → code_memory-1.0.18}/pyproject.toml +1 -1
  6. {code_memory-1.0.17 → code_memory-1.0.18}/queries.py +107 -5
  7. {code_memory-1.0.17 → code_memory-1.0.18}/server.py +87 -11
  8. {code_memory-1.0.17 → code_memory-1.0.18}/tests/test_logging.py +6 -6
  9. {code_memory-1.0.17 → code_memory-1.0.18}/.github/workflows/ci.yml +0 -0
  10. {code_memory-1.0.17 → code_memory-1.0.18}/.github/workflows/publish.yml +0 -0
  11. {code_memory-1.0.17 → code_memory-1.0.18}/.github/workflows/release-binaries.yml +0 -0
  12. {code_memory-1.0.17 → code_memory-1.0.18}/.gitignore +0 -0
  13. {code_memory-1.0.17 → code_memory-1.0.18}/.python-version +0 -0
  14. {code_memory-1.0.17 → code_memory-1.0.18}/CHANGELOG.md +0 -0
  15. {code_memory-1.0.17 → code_memory-1.0.18}/CONTRIBUTING.md +0 -0
  16. {code_memory-1.0.17 → code_memory-1.0.18}/LICENSE +0 -0
  17. {code_memory-1.0.17 → code_memory-1.0.18}/Makefile +0 -0
  18. {code_memory-1.0.17 → code_memory-1.0.18}/README.md +0 -0
  19. {code_memory-1.0.17 → code_memory-1.0.18}/assets/logo.png +0 -0
  20. {code_memory-1.0.17 → code_memory-1.0.18}/code-memory.spec +0 -0
  21. {code_memory-1.0.17 → code_memory-1.0.18}/doc_parser.py +0 -0
  22. {code_memory-1.0.17 → code_memory-1.0.18}/errors.py +0 -0
  23. {code_memory-1.0.17 → code_memory-1.0.18}/git_search.py +0 -0
  24. {code_memory-1.0.17 → code_memory-1.0.18}/hooks/hook-sentence_transformers.py +0 -0
  25. {code_memory-1.0.17 → code_memory-1.0.18}/hooks/hook-sqlite_vec.py +0 -0
  26. {code_memory-1.0.17 → code_memory-1.0.18}/hooks/hook-tree_sitter.py +0 -0
  27. {code_memory-1.0.17 → code_memory-1.0.18}/hooks/hook-tree_sitter_languages.py +0 -0
  28. {code_memory-1.0.17 → code_memory-1.0.18}/prompts/milestone_1.xml +0 -0
  29. {code_memory-1.0.17 → code_memory-1.0.18}/prompts/milestone_2.xml +0 -0
  30. {code_memory-1.0.17 → code_memory-1.0.18}/prompts/milestone_3.xml +0 -0
  31. {code_memory-1.0.17 → code_memory-1.0.18}/prompts/milestone_4.xml +0 -0
  32. {code_memory-1.0.17 → code_memory-1.0.18}/prompts/milestone_5.xml +0 -0
  33. {code_memory-1.0.17 → code_memory-1.0.18}/prompts/milestone_6.xml +0 -0
  34. {code_memory-1.0.17 → code_memory-1.0.18}/tests/__init__.py +0 -0
  35. {code_memory-1.0.17 → code_memory-1.0.18}/tests/conftest.py +0 -0
  36. {code_memory-1.0.17 → code_memory-1.0.18}/tests/test_errors.py +0 -0
  37. {code_memory-1.0.17 → code_memory-1.0.18}/tests/test_tools.py +0 -0
  38. {code_memory-1.0.17 → code_memory-1.0.18}/tests/test_validation.py +0 -0
  39. {code_memory-1.0.17 → code_memory-1.0.18}/uv.lock +0 -0
  40. {code_memory-1.0.17 → code_memory-1.0.18}/validation.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: code-memory
-Version: 1.0.17
+Version: 1.0.18
 Summary: A deterministic, high-precision code intelligence MCP server
 Project-URL: Homepage, https://github.com/kapillamba4/code-memory
 Project-URL: Documentation, https://github.com/kapillamba4/code-memory#readme
@@ -638,3 +638,108 @@ def upsert_doc_embedding(
     )
     if auto_commit:
         db.commit()
+
+
+# ---------------------------------------------------------------------------
+# Index Statistics
+# ---------------------------------------------------------------------------
+
+def get_index_stats(db: sqlite3.Connection, project_dir: str) -> dict:
+    """Get comprehensive statistics about the index.
+
+    Args:
+        db: An open sqlite3.Connection.
+        project_dir: The project directory path.
+
+    Returns:
+        Dictionary with index health metrics including:
+        - Total symbols, files, doc chunks indexed
+        - Index freshness (last indexed timestamps)
+        - Embedding model info and dimension
+        - Database size and WAL status
+    """
+    import os
+
+    # Get counts
+    symbols_count = db.execute("SELECT COUNT(*) FROM symbols").fetchone()[0]
+    files_count = db.execute("SELECT COUNT(*) FROM files").fetchone()[0]
+    doc_chunks_count = db.execute("SELECT COUNT(*) FROM doc_chunks").fetchone()[0]
+    doc_files_count = db.execute("SELECT COUNT(*) FROM doc_files").fetchone()[0]
+    references_count = db.execute("SELECT COUNT(*) FROM references_").fetchone()[0]
+    symbol_embeddings_count = db.execute("SELECT COUNT(*) FROM symbol_embeddings").fetchone()[0]
+    doc_embeddings_count = db.execute("SELECT COUNT(*) FROM doc_embeddings").fetchone()[0]
+
+    # Get symbol kinds distribution
+    symbol_kinds = dict(db.execute(
+        "SELECT kind, COUNT(*) FROM symbols GROUP BY kind ORDER BY COUNT(*) DESC"
+    ).fetchall())
+
+    # Get file types distribution (by extension)
+    file_extensions = dict(db.execute(
+        """SELECT substr(path, instr(path, '.')) as ext, COUNT(*) as cnt
+           FROM files
+           WHERE path LIKE '%.%'
+           GROUP BY ext
+           ORDER BY cnt DESC
+           LIMIT 10"""
+    ).fetchall())
+
+    # Get last indexed timestamps
+    last_file_indexed = db.execute(
+        "SELECT MAX(last_modified) FROM files"
+    ).fetchone()[0]
+    last_doc_indexed = db.execute(
+        "SELECT MAX(last_modified) FROM doc_files"
+    ).fetchone()[0]
+
+    # Get embedding model info
+    embedding_model = db.execute(
+        "SELECT value FROM index_metadata WHERE key = 'embedding_model'"
+    ).fetchone()
+    embedding_dim = db.execute(
+        "SELECT value FROM index_metadata WHERE key = 'embedding_dim'"
+    ).fetchone()
+
+    # Database file size
+    db_path = os.path.join(os.path.abspath(project_dir), "code_memory.db")
+    db_size_bytes = os.path.getsize(db_path) if os.path.exists(db_path) else 0
+    db_size_mb = round(db_size_bytes / (1024 * 1024), 2)
+
+    # WAL status
+    wal_path = db_path + "-wal"
+    wal_exists = os.path.exists(wal_path)
+    wal_size_mb = round(os.path.getsize(wal_path) / (1024 * 1024), 2) if wal_exists else 0
+
+    # Check journal mode
+    journal_mode = db.execute("PRAGMA journal_mode").fetchone()[0]
+
+    return {
+        "indexed": symbols_count > 0 or doc_chunks_count > 0,
+        "counts": {
+            "symbols": symbols_count,
+            "files": files_count,
+            "doc_chunks": doc_chunks_count,
+            "doc_files": doc_files_count,
+            "references": references_count,
+            "symbol_embeddings": symbol_embeddings_count,
+            "doc_embeddings": doc_embeddings_count,
+        },
+        "distributions": {
+            "symbol_kinds": symbol_kinds,
+            "file_extensions": file_extensions,
+        },
+        "freshness": {
+            "last_file_indexed": last_file_indexed,
+            "last_doc_indexed": last_doc_indexed,
+        },
+        "embedding": {
+            "model": embedding_model[0] if embedding_model else None,
+            "dimension": int(embedding_dim[0]) if embedding_dim else None,
+        },
+        "database": {
+            "size_mb": db_size_mb,
+            "journal_mode": journal_mode,
+            "wal_exists": wal_exists,
+            "wal_size_mb": wal_size_mb,
+        },
+    }
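For orientation, a minimal usage sketch of the new helper outside the MCP server. This is illustrative only and not part of the diff; it assumes the package's get_db(directory) helper as used in server.py and the code_memory.db filename seen above:

    import db as db_mod

    project = "/path/to/project"
    conn = db_mod.get_db(project)                  # opens <project>/code_memory.db
    stats = db_mod.get_index_stats(conn, project)
    print(stats["counts"]["symbols"], "symbols,", stats["counts"]["files"], "files")
    print(stats["database"]["size_mb"], "MB,", stats["database"]["journal_mode"], "journal mode")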
@@ -153,9 +153,9 @@ class IndexingLogger:
     def __init__(self, indexer_type: str):
         self.indexer_type = indexer_type
         self.logger = get_logger("indexing")
-        self.files_processed = 0
+        self.files_newly_indexed = 0
         self.items_indexed = 0
-        self.files_skipped = 0
+        self.files_unchanged = 0
         self.start_time: datetime | None = None

     def start(self, directory: str) -> None:
@@ -165,13 +165,13 @@ class IndexingLogger:

     def file_indexed(self, filepath: str, items: int = 1) -> None:
         """Log successful file indexing."""
-        self.files_processed += 1
+        self.files_newly_indexed += 1
         self.items_indexed += items
         self.logger.debug(f"Indexed {self.indexer_type}: {filepath} ({items} items)")

     def file_skipped(self, filepath: str, reason: str) -> None:
         """Log skipped file."""
-        self.files_skipped += 1
+        self.files_unchanged += 1
         self.logger.debug(f"Skipped {self.indexer_type}: {filepath} ({reason})")

     def complete(self) -> None:
@@ -179,8 +179,8 @@ class IndexingLogger:
         duration_ms = (datetime.now() - self.start_time).total_seconds() * 1000 if self.start_time else 0
         self.logger.info(
             f"Completed {self.indexer_type} indexing: "
-            f"files={self.files_processed} items={self.items_indexed} "
-            f"skipped={self.files_skipped} duration={duration_ms:.1f}ms"
+            f"files={self.files_newly_indexed} items={self.items_indexed} "
+            f"unchanged={self.files_unchanged} duration={duration_ms:.1f}ms"
         )

     def error(self, filepath: str, error_msg: str) -> None:
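A small sketch of what the renamed counters produce, using only the methods and message format shown in this hunk (the flat import is an assumption about the package layout; the duration value is whatever the clock says):

    from logging_config import IndexingLogger

    idx = IndexingLogger("code")
    idx.start("/repo")
    idx.file_indexed("a.py", items=3)          # DEBUG log, files_newly_indexed = 1
    idx.file_skipped("b.py", "unchanged")      # DEBUG log, files_unchanged = 1
    idx.complete()
    # INFO log: "Completed code indexing: files=1 items=3 unchanged=1 duration=...ms"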
@@ -11,6 +11,7 @@ from __future__ import annotations

 import logging
 import os
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 from typing import Any

@@ -21,6 +22,9 @@ import db as db_mod

 logger = logging.getLogger(__name__)

+# Number of worker threads for parallel indexing (configurable via env)
+MAX_WORKERS = int(os.environ.get("CODE_MEMORY_MAX_WORKERS", "4"))
+
 # ── Directories to always skip (even without .gitignore) ───────────────
 _SKIP_DIRS = frozenset({
     ".venv", "venv", "__pycache__", ".git", "node_modules",
@@ -452,7 +456,11 @@ def index_file(filepath: str, db) -> dict:
 # ---------------------------------------------------------------------------

 def index_directory(dirpath: str, db, progress_callback=None) -> list[dict]:
-    """Recursively index all source files under *dirpath*.
+    """Recursively index all source files under *dirpath* using parallel processing.
+
+    Uses ThreadPoolExecutor for parallel file I/O and parsing, while keeping
+    embedding generation sequential (sentence-transformers releases GIL during
+    inference). Processes files in batches for embedding efficiency.

     Skips directories in ``_SKIP_DIRS``, files matching ``.gitignore`` patterns
     (including nested .gitignore files), and unchanged files. Indexes any file
@@ -476,7 +484,7 @@ def index_directory(dirpath: str, db, progress_callback=None) -> list[dict]:
     gitignore = GitignoreMatcher(dirpath)
     logger.debug("Initialized gitignore matcher for %s", dirpath)

-    # First pass: count total files for progress reporting
+    # First pass: collect all files to index
     total_files = 0
     file_list = []
     for root, dirs, files in os.walk(dirpath, topdown=True):
@@ -494,69 +502,243 @@ def index_directory(dirpath: str, db, progress_callback=None) -> list[dict]:
             file_list.append(os.path.join(root, fname))
             total_files += 1

-    # Reset gitignore for actual indexing pass
-    gitignore = GitignoreMatcher(dirpath)
+    if not file_list:
+        return []

-    files_processed = 0
-    for root, dirs, files in os.walk(dirpath, topdown=True):
-        rel_root = os.path.relpath(root, dirpath)
-
-        # Check for .gitignore in current directory and load it
-        if rel_root != ".":
-            gitignore.check_dir_for_gitignore(root, rel_root)
+    # Report initial phase
+    if progress_callback:
+        progress_callback(0, total_files, "Scanning files for changes...")

-        # Prune skipped directories in-place (always-skip + gitignore)
-        def _should_keep_dir(d: str) -> bool:
-            if d in _SKIP_DIRS or d.endswith(".egg-info"):
-                return False
-            rel_path = os.path.join(rel_root, d) if rel_root != "." else d
-            if gitignore.should_skip(rel_path, is_dir=True):
-                return False
-            return True
+    # Phase 1: Parallel file freshness check and parsing
+    # Each worker returns parsed data (not yet stored to DB)
+    files_processed = 0
+    parsed_files: list[tuple[str, dict | None, Exception | None]] = []  # (filepath, parsed_data, error)

-        dirs[:] = [d for d in dirs if _should_keep_dir(d)]
+    def _parse_file_task(fpath: str) -> tuple[str, dict | None, Exception | None]:
+        """Parse a single file and return extracted data (without DB writes)."""
+        try:
+            parsed = _parse_file_for_indexing(fpath, db)
+            return (fpath, parsed, None)
+        except Exception as e:
+            return (fpath, None, e)

-        for fname in sorted(files):
-            # Skip files matching .gitignore patterns
-            rel_path = os.path.join(rel_root, fname) if rel_root != "." else fname
-            if gitignore.should_skip(rel_path, is_dir=False):
-                continue
+    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+        # Submit all parsing tasks
+        future_to_path = {executor.submit(_parse_file_task, fpath): fpath for fpath in file_list}

-            ext = os.path.splitext(fname)[1].lower()
-            # Accept files with known extensions, or files with a
-            # tree-sitter grammar available
-            if ext not in _SOURCE_EXTENSIONS and _load_language(ext) is None:
-                continue
+        for future in as_completed(future_to_path):
+            fpath, parsed_data, error = future.result()
+            parsed_files.append((fpath, parsed_data, error))

-            fpath = os.path.join(root, fname)
-            try:
-                result = index_file(fpath, db)
-                results.append(result)
-            except Exception:
-                logger.exception("Failed to index %s", fpath)
-                results.append({
-                    "file": fpath,
-                    "symbols_indexed": 0,
-                    "references_indexed": 0,
-                    "skipped": True,
-                    "error": True,
-                })
-
-            # Report progress
             files_processed += 1
             if progress_callback:
-                progress_callback(files_processed, total_files, f"Indexing code: {fname}")
+                fname = os.path.basename(fpath)
+                progress_callback(files_processed, total_files, f"Parsing: {fname}")
+
+    # Phase 2: Batch embedding generation (sequential, GIL released during inference)
+    if progress_callback:
+        progress_callback(total_files, total_files, "Generating embeddings...")
+
+    # Collect all texts that need embedding
+    embedding_batches: list[tuple[str, list[tuple]]] = []  # (filepath, [(embed_text, symbol_data), ...])
+
+    for fpath, parsed_data, error in parsed_files:
+        if error or parsed_data is None or parsed_data.get("skipped"):
+            continue
+
+        embed_inputs = []
+        for sym in parsed_data.get("symbols", []):
+            embed_input = f"{sym['kind']} {sym['name']}: {sym['source_text'][:1000]}"
+            embed_inputs.append((embed_input, sym))
+
+        if embed_inputs:
+            embedding_batches.append((fpath, embed_inputs, parsed_data))
+
+    # Generate embeddings in batch
+    all_embed_texts = []
+    for fpath, embed_inputs, _ in embedding_batches:
+        for embed_text, _ in embed_inputs:
+            all_embed_texts.append(embed_text)
+
+    all_embeddings = db_mod.embed_texts_batch(all_embed_texts, batch_size=64) if all_embed_texts else []
+
+    # Phase 3: Sequential DB writes (to avoid SQLite conflicts)
+    if progress_callback:
+        progress_callback(total_files, total_files, "Storing to database...")
+
+    embed_idx = 0
+    for fpath, parsed_data, error in parsed_files:
+        if error:
+            logger.exception("Failed to index %s", fpath)
+            results.append({
+                "file": fpath,
+                "symbols_indexed": 0,
+                "references_indexed": 0,
+                "skipped": True,
+                "error": True,
+            })
+            continue
+
+        if parsed_data is None or parsed_data.get("skipped"):
+            results.append({
+                "file": fpath,
+                "symbols_indexed": 0,
+                "references_indexed": 0,
+                "skipped": True,
+            })
+            continue
+
+        # Find embeddings for this file
+        file_result = _store_parsed_file(fpath, parsed_data, db, embedding_batches, all_embeddings, embed_idx)
+        embed_idx += len(parsed_data.get("symbols", []))
+        results.append(file_result)

     # Log performance summary
     total_elapsed = time.perf_counter() - total_start
     total_symbols = sum(r.get("symbols_indexed", 0) for r in results)
     total_refs = sum(r.get("references_indexed", 0) for r in results)
-    files_indexed = sum(1 for r in results if not r.get("skipped"))
-    files_skipped = sum(1 for r in results if r.get("skipped") and not r.get("error"))
-
-    logger.info(
-        "Indexed %d files (%d skipped) in %.2fs - %d symbols, %d references",
-        files_indexed, files_skipped, total_elapsed, total_symbols, total_refs
-    )
+    files_newly_indexed = sum(1 for r in results if not r.get("skipped"))
+    files_unchanged = sum(1 for r in results if r.get("skipped") and not r.get("error"))
+
+    if total_files > 0:
+        files_per_sec = total_files / total_elapsed if total_elapsed > 0 else 0
+        logger.info(
+            "Indexed %d files (%d unchanged) in %.2fs (%.1f files/s) - %d symbols, %d references",
+            files_newly_indexed, files_unchanged, total_elapsed, files_per_sec, total_symbols, total_refs
+        )
+    else:
+        logger.info(
+            "Indexed %d files (%d unchanged) in %.2fs - %d symbols, %d references",
+            files_newly_indexed, files_unchanged, total_elapsed, total_symbols, total_refs
+        )

     return results
+
+
+def _parse_file_for_indexing(filepath: str, db) -> dict | None:
+    """Parse a file and extract symbols/references without DB writes.
+
+    Returns parsed data structure or None if skipped.
+    """
+    filepath = os.path.abspath(filepath)
+    ext = os.path.splitext(filepath)[1].lower()
+
+    # Check freshness
+    mtime = os.path.getmtime(filepath)
+    row = db.execute(
+        "SELECT id, last_modified FROM files WHERE path = ?", (filepath,)
+    ).fetchone()
+
+    if row and row[1] >= mtime:
+        return {"skipped": True, "file_id": row[0]}
+
+    # Read file
+    source_bytes = Path(filepath).read_bytes()
+    source_text = source_bytes.decode("utf-8", errors="replace")
+
+    fhash = db_mod.file_hash(filepath)
+
+    result = {
+        "skipped": False,
+        "mtime": mtime,
+        "fhash": fhash,
+        "symbols": [],
+        "references": [],
+        "fallback": False,
+    }
+
+    # Try tree-sitter parsing
+    lang = _load_language(ext)
+
+    if lang is not None:
+        parser = Parser(lang)
+        tree = parser.parse(source_bytes)
+
+        # Extract symbols (flat list for batch processing)
+        raw_symbols = _extract_symbols(tree.root_node, source_bytes)
+        all_symbols: list[dict] = []
+
+        def _collect_symbols(sym_list):
+            for sym in sym_list:
+                all_symbols.append(sym)
+                if sym.get("children"):
+                    _collect_symbols(sym["children"])
+
+        _collect_symbols(raw_symbols)
+        result["symbols"] = all_symbols
+
+        # Extract references
+        refs = _extract_references(tree.root_node, source_bytes)
+        result["references"] = refs
+    else:
+        # Fallback: entire file as one symbol
+        basename = os.path.basename(filepath)
+        result["symbols"] = [{
+            "name": basename,
+            "kind": "file",
+            "line_start": 1,
+            "line_end": source_text.count("\n") + 1,
+            "source_text": source_text[:5000],
+            "parent_id": None,
+        }]
+        result["fallback"] = True
+
+    return result
+
+
+def _store_parsed_file(
+    filepath: str,
+    parsed_data: dict,
+    db,
+    embedding_batches: list,
+    all_embeddings: list,
+    start_embed_idx: int
+) -> dict:
+    """Store parsed file data to database with pre-computed embeddings."""
+    filepath = os.path.abspath(filepath)
+
+    # Upsert file record
+    file_id = db_mod.upsert_file(db, filepath, parsed_data["mtime"], parsed_data["fhash"])
+
+    # Delete stale data
+    db_mod.delete_file_data(db, file_id)
+
+    symbols_indexed = 0
+    references_indexed = 0
+
+    # Find embeddings for this file
+    file_embeddings = None
+    embed_offset = 0
+    for bfpath, embed_inputs, _ in embedding_batches:
+        if bfpath == filepath:
+            file_embeddings = all_embeddings[start_embed_idx + embed_offset:start_embed_idx + embed_offset + len(embed_inputs)]
+            break
+        embed_offset += len(embed_inputs)
+
+    # Store symbols with embeddings
+    if parsed_data.get("symbols") and file_embeddings:
+        with db_mod.transaction(db):
+            for i, sym in enumerate(parsed_data["symbols"]):
+                sym_id = db_mod.upsert_symbol(
+                    db, sym["name"], sym["kind"], file_id,
+                    sym["line_start"], sym["line_end"],
+                    sym.get("parent_id"), sym["source_text"],
+                    auto_commit=False
+                )
+                if i < len(file_embeddings):
+                    db_mod.upsert_embedding(db, sym_id, file_embeddings[i], auto_commit=False)
+                symbols_indexed += 1
+
+    # Store references
+    if parsed_data.get("references"):
+        with db_mod.transaction(db):
+            for ref in parsed_data["references"]:
+                db_mod.upsert_reference(db, ref["name"], file_id, ref["line"], auto_commit=False)
+                references_indexed += 1
+
+    return {
+        "file": filepath,
+        "symbols_indexed": symbols_indexed,
+        "references_indexed": references_indexed,
+        "skipped": False,
+    }
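The rewritten index_directory above boils down to three phases. A standalone sketch of the same pattern, with hypothetical parse/embed_batch/store callables (names not from the package) and a default of four workers mirroring CODE_MEMORY_MAX_WORKERS, may help as a mental model:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def index_many(paths, parse, embed_batch, store, max_workers=4):
        # Phase 1: parse files in a thread pool; nothing touches the database yet
        parsed = {}
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = {pool.submit(parse, p): p for p in paths}
            for fut in as_completed(futures):
                parsed[futures[fut]] = fut.result()

        # Phase 2: one batched embedding call over every extracted text
        texts = [t for data in parsed.values() for t in data["texts"]]
        vectors = iter(embed_batch(texts))

        # Phase 3: sequential writes, so a single SQLite connection is never shared
        for path, data in parsed.items():
            store(path, data, [next(vectors) for _ in data["texts"]])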
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

 [project]
 name = "code-memory"
-version = "1.0.17"
+version = "1.0.18"
 description = "A deterministic, high-precision code intelligence MCP server"
 readme = "README.md"
 license = "MIT"
@@ -108,7 +108,8 @@ def hybrid_search(query: str, db, top_k: int = 10) -> list[dict]:
         top_k: Number of results to return.

     Returns:
-        A list of result dicts sorted by descending RRF score.
+        A list of result dicts sorted by descending RRF score, including
+        match_reason, match_highlights, and confidence.
     """
     bm25_results = _bm25_search(query, db, top_k=50)
     vec_results = _vector_search(query, db, top_k=50)
@@ -116,6 +117,7 @@ def hybrid_search(query: str, db, top_k: int = 10) -> list[dict]:
     # Build RRF score map keyed by symbol_id
     scores: dict[int, float] = {}
     details: dict[int, dict] = {}
+    match_sources: dict[int, list[str]] = {}  # Track which search found each result

     for rank, r in enumerate(bm25_results, start=1):
         sid = r["symbol_id"]
@@ -128,6 +130,8 @@ def hybrid_search(query: str, db, top_k: int = 10) -> list[dict]:
             "line_end": r["line_end"],
             "source_text": r["source_text"],
         }
+        match_sources[sid] = match_sources.get(sid, [])
+        match_sources[sid].append("bm25")

     for rank, r in enumerate(vec_results, start=1):
         sid = r["symbol_id"]
@@ -141,14 +145,112 @@ def hybrid_search(query: str, db, top_k: int = 10) -> list[dict]:
             "line_end": r["line_end"],
             "source_text": r["source_text"],
         }
+        match_sources[sid] = match_sources.get(sid, [])
+        if "vector" not in match_sources[sid]:
+            match_sources[sid].append("vector")

     # Sort by descending RRF score
     ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:top_k]

-    return [
-        {**details[sid], "score": round(score, 6)}
-        for sid, score in ranked
-    ]
+    # Build results with match metadata
+    results = []
+    for sid, score in ranked:
+        sources = match_sources.get(sid, [])
+        is_hybrid = len(sources) == 2
+
+        # Determine match reason
+        if is_hybrid:
+            match_reason = "hybrid (BM25 + semantic)"
+        elif "bm25" in sources:
+            match_reason = "keyword match (BM25)"
+        else:
+            match_reason = "semantic match (vector)"
+
+        # Calculate confidence (normalize RRF score to 0-1 range)
+        # Max possible RRF score for a single source is 1/61 ≈ 0.0164
+        # For hybrid it's 2/61 ≈ 0.0328. We normalize accordingly.
+        max_single_rrf = 1.0 / (_RRF_K + 1)  # ≈ 0.0164
+        max_hybrid_rrf = 2.0 * max_single_rrf  # ≈ 0.0328
+        if is_hybrid:
+            confidence = min(1.0, score / max_hybrid_rrf)
+        else:
+            confidence = min(1.0, (score / max_single_rrf) * 0.7)  # Cap single-source at 0.7
+
+        result = {
+            **details[sid],
+            "score": round(score, 6),
+            "match_reason": match_reason,
+            "confidence": round(confidence, 3),
+            "match_highlights": [],  # Will be populated below if BM25 match
+        }
+
+        # Get highlights for BM25 matches using FTS5 highlight function
+        if "bm25" in sources:
+            highlights = _get_bm25_highlights(query, details[sid]["source_text"], db)
+            result["match_highlights"] = highlights
+
+        results.append(result)
+
+    return results
+
+
+def _get_bm25_highlights(query: str, source_text: str, db) -> list[str]:
+    """Extract highlighted snippets using FTS5.
+
+    Returns up to 3 highlighted text snippets showing where the query matched.
+    """
+    if not source_text or not query:
+        return []
+
+    # Use FTS5 highlight function to get matched portions
+    safe_query = query.replace('"', '""')
+    try:
+        # Create a temporary FTS5 query to get highlights
+        # We use the snippet function which returns highlighted fragments
+        rows = db.execute(
+            """
+            SELECT snippet(symbols_fts, 1, '>>>', '<<<', '...', 20) as highlight
+            FROM symbols_fts
+            WHERE symbols_fts MATCH ?
+            LIMIT 3
+            """,
+            (safe_query,),
+        ).fetchall()
+
+        highlights = []
+        for row in rows:
+            if row[0] and row[0] not in ("...", ""):
+                # Clean up the highlight markers for readability
+                highlight = row[0].replace(">>>", "**").replace("<<<", "**")
+                if len(highlight) > 10:  # Only include meaningful highlights
+                    highlights.append(highlight)
+
+        return highlights[:3]  # Return at most 3 highlights
+    except Exception:
+        # Fallback: find query terms in source text
+        return _simple_highlights(query, source_text)
+
+
+def _simple_highlights(query: str, source_text: str) -> list[str]:
+    """Simple fallback highlight extraction when FTS5 isn't available."""
+    highlights = []
+    query_terms = query.lower().split()
+    lines = source_text.split("\n")
+
+    for line in lines[:20]:  # Check first 20 lines
+        line_lower = line.lower()
+        for term in query_terms:
+            if term in line_lower and len(line.strip()) > 10:
+                # Truncate long lines
+                snippet = line.strip()[:100]
+                if len(snippet) > 50:
+                    snippet = snippet[:97] + "..."
+                highlights.append(snippet)
+                break
+        if len(highlights) >= 3:
+            break
+
+    return highlights[:3]


 # ---------------------------------------------------------------------------
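A worked check of the confidence normalisation above. It assumes _RRF_K = 60, which the 1/61 ≈ 0.0164 in the hunk's comments implies; the ranks used are made up for illustration:

    _RRF_K = 60

    # A symbol ranked 1st by BM25 and 3rd by the vector search:
    score = 1 / (_RRF_K + 1) + 1 / (_RRF_K + 3)        # ≈ 0.01639 + 0.01587 = 0.03227
    max_hybrid_rrf = 2 / (_RRF_K + 1)                  # ≈ 0.03279
    print(round(min(1.0, score / max_hybrid_rrf), 3))  # 0.984

    # The same symbol found only by BM25 at rank 1 is capped at 0.7:
    single = 1 / (_RRF_K + 1)
    print(min(1.0, (single / (1 / (_RRF_K + 1))) * 0.7))  # 0.7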
@@ -128,6 +128,45 @@ def check_index_status(directory: str) -> dict:
     }


+# ── Tool 0.5: get_index_stats ─────────────────────────────────────────────
+@mcp.tool()
+def get_index_stats(directory: str) -> dict:
+    """USE THIS TOOL to get comprehensive statistics about the code index.
+
+    This tool provides detailed metrics about the index health, including
+    file counts, symbol distributions, embedding model info, and database size.
+
+    TRIGGER - Call this tool when:
+    - You want to understand what's in the index
+    - Debugging search quality issues
+    - Checking index freshness or coverage
+    - Monitoring database size and health
+
+    Do NOT use this tool for:
+    - Checking if indexing is needed (use check_index_status)
+    - Searching for code (use search_code)
+
+    Args:
+        directory: Path to the project directory.
+
+    Returns:
+        Dictionary with:
+        - indexed: boolean - true if anything has been indexed
+        - counts: Symbol, file, chunk, and embedding counts
+        - distributions: Symbol kinds and file extensions
+        - freshness: Last indexed timestamps
+        - embedding: Model name and dimension
+        - database: Size, journal mode, and WAL status
+    """
+    with logging_config.ToolLogger("get_index_stats", directory=directory):
+        try:
+            database = db_mod.get_db(directory)
+            stats = db_mod.get_index_stats(database, directory)
+            return {"status": "ok", **stats}
+        except Exception as e:
+            return errors.format_error(e)
+
+
 # ── Tool 1: search_code ───────────────────────────────────────────────────
 @mcp.tool()
 def search_code(
@@ -294,6 +333,7 @@ async def index_codebase(directory: str, ctx: Context) -> dict:
     - Enables semantic search via vector embeddings
    - Builds cross-reference graphs for "find all usages" queries
     - Incremental indexing: unchanged files are automatically skipped
+    - PARALLEL PROCESSING: Uses thread pool for faster indexing

     Do NOT use this tool for:
     - Non-code files (images, binaries, data files)
@@ -306,6 +346,8 @@ async def index_codebase(directory: str, ctx: Context) -> dict:
     Returns:
         Summary with files_indexed, total_symbols, total_chunks, and details.
     """
+    import time
+
     with logging_config.ToolLogger("index_codebase", directory=directory) as log:
         try:
             # Validate directory
@@ -313,20 +355,38 @@ async def index_codebase(directory: str, ctx: Context) -> dict:

             database = db_mod.get_db(str(directory_path))

+            # Track timing for throughput calculation
+            start_time = time.perf_counter()
+
             # Report initial progress
             await ctx.report_progress(0, 100, "Starting indexing...")

             # Create progress callback that schedules progress updates on the event loop
             loop = asyncio.get_running_loop()
-            progress_state = {"current": 0, "total": 0, "phase": "code"}
+            progress_state = {"current": 0, "total": 0, "phase": "scanning"}

             def sync_progress_callback(current: int, total: int, message: str):
-                """Sync callback that schedules async progress reporting."""
+                """Sync callback that schedules async progress reporting with throughput info."""
                 progress_state["current"] = current
                 progress_state["total"] = total
+
+                # Calculate throughput and ETA
+                elapsed = time.perf_counter() - start_time
+                if elapsed > 0 and current > 0:
+                    files_per_sec = current / elapsed
+                    if files_per_sec > 0 and total > current:
+                        remaining_files = total - current
+                        eta_seconds = remaining_files / files_per_sec
+                        eta_str = f", ETA: {int(eta_seconds)}s" if eta_seconds < 60 else f", ETA: {int(eta_seconds / 60)}m"
+                    else:
+                        eta_str = ""
+                    throughput_str = f" ({files_per_sec:.1f} files/s{eta_str})"
+                else:
+                    throughput_str = ""
+
                 # Schedule the async progress report on the event loop
                 asyncio.run_coroutine_threadsafe(
-                    ctx.report_progress(current, total, message),
+                    ctx.report_progress(current, total, f"{message}{throughput_str}"),
                     loop
                 )

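The ETA string in the callback above follows directly from the running throughput; a worked example with made-up numbers:

    # After 12.5 s the callback has seen current = 250 of total = 1000 files:
    elapsed = 12.5
    current, total = 250, 1000
    files_per_sec = current / elapsed                 # 20.0 files/s
    eta_seconds = (total - current) / files_per_sec   # 750 / 20 = 37.5 s
    # eta_seconds < 60, so the progress message ends with "(20.0 files/s, ETA: 37s)"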
@@ -334,7 +394,7 @@ async def index_codebase(directory: str, ctx: Context) -> dict:
             code_logger = logging_config.IndexingLogger("code")
             code_logger.start(str(directory_path))

-            await ctx.report_progress(0, 100, "Scanning code files...")
+            await ctx.report_progress(0, 100, "Phase 1/3: Scanning code files...")

             code_results = await asyncio.to_thread(
                 parser_mod.index_directory,
@@ -361,7 +421,7 @@ async def index_codebase(directory: str, ctx: Context) -> dict:
             code_file_count = len(code_results)
             doc_progress_offset = code_file_count

-            await ctx.report_progress(code_file_count, code_file_count, "Scanning documentation files...")
+            await ctx.report_progress(code_file_count, code_file_count, "Phase 2/3: Scanning documentation files...")

             doc_results = await asyncio.to_thread(
                 doc_parser_mod.index_doc_directory,
@@ -383,7 +443,7 @@ async def index_codebase(directory: str, ctx: Context) -> dict:
             doc_skipped = [r for r in doc_results if r.get("skipped")]

             # Extract docstrings from indexed code
-            await ctx.report_progress(0, 0, "Extracting docstrings...")
+            await ctx.report_progress(0, 0, "Phase 3/3: Extracting docstrings...")
             docstring_results = await asyncio.to_thread(
                 doc_parser_mod.extract_docstrings_from_code,
                 database
@@ -393,20 +453,36 @@ async def index_codebase(directory: str, ctx: Context) -> dict:
             total_chunks = sum(r.get("chunks_indexed", 0) for r in doc_indexed)
             log.set_result_count(total_symbols + total_chunks + len(docstring_results))

-            await ctx.report_progress(100, 100, "Indexing complete!")
+            # Calculate final throughput
+            total_elapsed = time.perf_counter() - start_time
+            total_files = len(code_results) + len(doc_results)
+            files_per_sec = total_files / total_elapsed if total_elapsed > 0 else 0
+
+            await ctx.report_progress(100, 100, f"Indexing complete! ({files_per_sec:.1f} files/s)")
+
+            # Get total indexed counts from database for cumulative stats
+            total_code_files = database.execute("SELECT COUNT(*) FROM files").fetchone()[0]
+            total_doc_files = database.execute("SELECT COUNT(*) FROM doc_files").fetchone()[0]

             return {
                 "status": "ok",
                 "directory": str(directory_path),
+                "performance": {
+                    "total_time_seconds": round(total_elapsed, 2),
+                    "files_per_second": round(files_per_sec, 1),
+                    "total_files_processed": total_files,
+                },
                 "code": {
-                    "files_indexed": len(indexed),
-                    "files_skipped": len(skipped),
+                    "files_newly_indexed": len(indexed),
+                    "files_unchanged": len(skipped),
+                    "total_indexed_files": total_code_files,
                     "total_symbols": total_symbols,
                     "total_references": sum(r.get("references_indexed", 0) for r in indexed),
                 },
                 "documentation": {
-                    "files_indexed": len(doc_indexed),
-                    "files_skipped": len(doc_skipped),
+                    "files_newly_indexed": len(doc_indexed),
+                    "files_unchanged": len(doc_skipped),
+                    "total_indexed_files": total_doc_files,
                     "total_chunks": total_chunks,
                     "docstrings_extracted": len(docstring_results),
                 },
@@ -133,19 +133,19 @@ class TestToolLogger:
 class TestIndexingLogger:
     """Tests for IndexingLogger class."""

-    def test_tracks_files_processed(self):
-        """Test that files processed are tracked."""
+    def test_tracks_files_newly_indexed(self):
+        """Test that files newly indexed are tracked."""
         idx_logger = logging_config.IndexingLogger("test")
         idx_logger.file_indexed("file1.py", 3)
         idx_logger.file_indexed("file2.py", 2)
-        assert idx_logger.files_processed == 2
+        assert idx_logger.files_newly_indexed == 2
         assert idx_logger.items_indexed == 5

-    def test_tracks_files_skipped(self):
-        """Test that files skipped are tracked."""
+    def test_tracks_files_unchanged(self):
+        """Test that files unchanged are tracked."""
         idx_logger = logging_config.IndexingLogger("test")
         idx_logger.file_skipped("file1.py", "unchanged")
-        assert idx_logger.files_skipped == 1
+        assert idx_logger.files_unchanged == 1


 class TestPreconfiguredLoggers: