code-memory 1.0.17__tar.gz → 1.0.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. {code_memory-1.0.17 → code_memory-1.0.18}/PKG-INFO +1 -1
  2. {code_memory-1.0.17 → code_memory-1.0.18}/db.py +105 -0
  3. {code_memory-1.0.17 → code_memory-1.0.18}/logging_config.py +6 -6
  4. {code_memory-1.0.17 → code_memory-1.0.18}/parser.py +235 -53
  5. {code_memory-1.0.17 → code_memory-1.0.18}/pyproject.toml +1 -1
  6. {code_memory-1.0.17 → code_memory-1.0.18}/queries.py +107 -5
  7. {code_memory-1.0.17 → code_memory-1.0.18}/server.py +87 -11
  8. {code_memory-1.0.17 → code_memory-1.0.18}/tests/test_logging.py +6 -6
  9. {code_memory-1.0.17 → code_memory-1.0.18}/.github/workflows/ci.yml +0 -0
  10. {code_memory-1.0.17 → code_memory-1.0.18}/.github/workflows/publish.yml +0 -0
  11. {code_memory-1.0.17 → code_memory-1.0.18}/.github/workflows/release-binaries.yml +0 -0
  12. {code_memory-1.0.17 → code_memory-1.0.18}/.gitignore +0 -0
  13. {code_memory-1.0.17 → code_memory-1.0.18}/.python-version +0 -0
  14. {code_memory-1.0.17 → code_memory-1.0.18}/CHANGELOG.md +0 -0
  15. {code_memory-1.0.17 → code_memory-1.0.18}/CONTRIBUTING.md +0 -0
  16. {code_memory-1.0.17 → code_memory-1.0.18}/LICENSE +0 -0
  17. {code_memory-1.0.17 → code_memory-1.0.18}/Makefile +0 -0
  18. {code_memory-1.0.17 → code_memory-1.0.18}/README.md +0 -0
  19. {code_memory-1.0.17 → code_memory-1.0.18}/assets/logo.png +0 -0
  20. {code_memory-1.0.17 → code_memory-1.0.18}/code-memory.spec +0 -0
  21. {code_memory-1.0.17 → code_memory-1.0.18}/doc_parser.py +0 -0
  22. {code_memory-1.0.17 → code_memory-1.0.18}/errors.py +0 -0
  23. {code_memory-1.0.17 → code_memory-1.0.18}/git_search.py +0 -0
  24. {code_memory-1.0.17 → code_memory-1.0.18}/hooks/hook-sentence_transformers.py +0 -0
  25. {code_memory-1.0.17 → code_memory-1.0.18}/hooks/hook-sqlite_vec.py +0 -0
  26. {code_memory-1.0.17 → code_memory-1.0.18}/hooks/hook-tree_sitter.py +0 -0
  27. {code_memory-1.0.17 → code_memory-1.0.18}/hooks/hook-tree_sitter_languages.py +0 -0
  28. {code_memory-1.0.17 → code_memory-1.0.18}/prompts/milestone_1.xml +0 -0
  29. {code_memory-1.0.17 → code_memory-1.0.18}/prompts/milestone_2.xml +0 -0
  30. {code_memory-1.0.17 → code_memory-1.0.18}/prompts/milestone_3.xml +0 -0
  31. {code_memory-1.0.17 → code_memory-1.0.18}/prompts/milestone_4.xml +0 -0
  32. {code_memory-1.0.17 → code_memory-1.0.18}/prompts/milestone_5.xml +0 -0
  33. {code_memory-1.0.17 → code_memory-1.0.18}/prompts/milestone_6.xml +0 -0
  34. {code_memory-1.0.17 → code_memory-1.0.18}/tests/__init__.py +0 -0
  35. {code_memory-1.0.17 → code_memory-1.0.18}/tests/conftest.py +0 -0
  36. {code_memory-1.0.17 → code_memory-1.0.18}/tests/test_errors.py +0 -0
  37. {code_memory-1.0.17 → code_memory-1.0.18}/tests/test_tools.py +0 -0
  38. {code_memory-1.0.17 → code_memory-1.0.18}/tests/test_validation.py +0 -0
  39. {code_memory-1.0.17 → code_memory-1.0.18}/uv.lock +0 -0
  40. {code_memory-1.0.17 → code_memory-1.0.18}/validation.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: code-memory
-Version: 1.0.17
+Version: 1.0.18
 Summary: A deterministic, high-precision code intelligence MCP server
 Project-URL: Homepage, https://github.com/kapillamba4/code-memory
 Project-URL: Documentation, https://github.com/kapillamba4/code-memory#readme
@@ -638,3 +638,108 @@ def upsert_doc_embedding(
     )
     if auto_commit:
         db.commit()
+
+
+# ---------------------------------------------------------------------------
+# Index Statistics
+# ---------------------------------------------------------------------------
+
+def get_index_stats(db: sqlite3.Connection, project_dir: str) -> dict:
+    """Get comprehensive statistics about the index.
+
+    Args:
+        db: An open sqlite3.Connection.
+        project_dir: The project directory path.
+
+    Returns:
+        Dictionary with index health metrics including:
+        - Total symbols, files, doc chunks indexed
+        - Index freshness (last indexed timestamps)
+        - Embedding model info and dimension
+        - Database size and WAL status
+    """
+    import os
+
+    # Get counts
+    symbols_count = db.execute("SELECT COUNT(*) FROM symbols").fetchone()[0]
+    files_count = db.execute("SELECT COUNT(*) FROM files").fetchone()[0]
+    doc_chunks_count = db.execute("SELECT COUNT(*) FROM doc_chunks").fetchone()[0]
+    doc_files_count = db.execute("SELECT COUNT(*) FROM doc_files").fetchone()[0]
+    references_count = db.execute("SELECT COUNT(*) FROM references_").fetchone()[0]
+    symbol_embeddings_count = db.execute("SELECT COUNT(*) FROM symbol_embeddings").fetchone()[0]
+    doc_embeddings_count = db.execute("SELECT COUNT(*) FROM doc_embeddings").fetchone()[0]
+
+    # Get symbol kinds distribution
+    symbol_kinds = dict(db.execute(
+        "SELECT kind, COUNT(*) FROM symbols GROUP BY kind ORDER BY COUNT(*) DESC"
+    ).fetchall())
+
+    # Get file types distribution (by extension)
+    file_extensions = dict(db.execute(
+        """SELECT substr(path, instr(path, '.')) as ext, COUNT(*) as cnt
+           FROM files
+           WHERE path LIKE '%.%'
+           GROUP BY ext
+           ORDER BY cnt DESC
+           LIMIT 10"""
+    ).fetchall())
+
+    # Get last indexed timestamps
+    last_file_indexed = db.execute(
+        "SELECT MAX(last_modified) FROM files"
+    ).fetchone()[0]
+    last_doc_indexed = db.execute(
+        "SELECT MAX(last_modified) FROM doc_files"
+    ).fetchone()[0]
+
+    # Get embedding model info
+    embedding_model = db.execute(
+        "SELECT value FROM index_metadata WHERE key = 'embedding_model'"
+    ).fetchone()
+    embedding_dim = db.execute(
+        "SELECT value FROM index_metadata WHERE key = 'embedding_dim'"
+    ).fetchone()
+
+    # Database file size
+    db_path = os.path.join(os.path.abspath(project_dir), "code_memory.db")
+    db_size_bytes = os.path.getsize(db_path) if os.path.exists(db_path) else 0
+    db_size_mb = round(db_size_bytes / (1024 * 1024), 2)
+
+    # WAL status
+    wal_path = db_path + "-wal"
+    wal_exists = os.path.exists(wal_path)
+    wal_size_mb = round(os.path.getsize(wal_path) / (1024 * 1024), 2) if wal_exists else 0
+
+    # Check journal mode
+    journal_mode = db.execute("PRAGMA journal_mode").fetchone()[0]
+
+    return {
+        "indexed": symbols_count > 0 or doc_chunks_count > 0,
+        "counts": {
+            "symbols": symbols_count,
+            "files": files_count,
+            "doc_chunks": doc_chunks_count,
+            "doc_files": doc_files_count,
+            "references": references_count,
+            "symbol_embeddings": symbol_embeddings_count,
+            "doc_embeddings": doc_embeddings_count,
+        },
+        "distributions": {
+            "symbol_kinds": symbol_kinds,
+            "file_extensions": file_extensions,
+        },
+        "freshness": {
+            "last_file_indexed": last_file_indexed,
+            "last_doc_indexed": last_doc_indexed,
+        },
+        "embedding": {
+            "model": embedding_model[0] if embedding_model else None,
+            "dimension": int(embedding_dim[0]) if embedding_dim else None,
+        },
+        "database": {
+            "size_mb": db_size_mb,
+            "journal_mode": journal_mode,
+            "wal_exists": wal_exists,
+            "wal_size_mb": wal_size_mb,
+        },
+    }
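For orientation, a minimal usage sketch of the new helper outside the MCP server. This is illustrative only and not part of the diff; it assumes the package's get_db(directory) helper as used in server.py and the code_memory.db filename seen above:

    import db as db_mod

    project = "/path/to/project"
    conn = db_mod.get_db(project)                  # opens <project>/code_memory.db
    stats = db_mod.get_index_stats(conn, project)
    print(stats["counts"]["symbols"], "symbols,", stats["counts"]["files"], "files")
    print(stats["database"]["size_mb"], "MB,", stats["database"]["journal_mode"], "journal mode")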
@@ -153,9 +153,9 @@ class IndexingLogger:
     def __init__(self, indexer_type: str):
         self.indexer_type = indexer_type
         self.logger = get_logger("indexing")
-        self.files_processed = 0
+        self.files_newly_indexed = 0
         self.items_indexed = 0
-        self.files_skipped = 0
+        self.files_unchanged = 0
         self.start_time: datetime | None = None

     def start(self, directory: str) -> None:
@@ -165,13 +165,13 @@ class IndexingLogger:

     def file_indexed(self, filepath: str, items: int = 1) -> None:
         """Log successful file indexing."""
-        self.files_processed += 1
+        self.files_newly_indexed += 1
         self.items_indexed += items
         self.logger.debug(f"Indexed {self.indexer_type}: {filepath} ({items} items)")

     def file_skipped(self, filepath: str, reason: str) -> None:
         """Log skipped file."""
-        self.files_skipped += 1
+        self.files_unchanged += 1
         self.logger.debug(f"Skipped {self.indexer_type}: {filepath} ({reason})")

     def complete(self) -> None:
@@ -179,8 +179,8 @@ class IndexingLogger:
         duration_ms = (datetime.now() - self.start_time).total_seconds() * 1000 if self.start_time else 0
         self.logger.info(
             f"Completed {self.indexer_type} indexing: "
-            f"files={self.files_processed} items={self.items_indexed} "
-            f"skipped={self.files_skipped} duration={duration_ms:.1f}ms"
+            f"files={self.files_newly_indexed} items={self.items_indexed} "
+            f"unchanged={self.files_unchanged} duration={duration_ms:.1f}ms"
         )

     def error(self, filepath: str, error_msg: str) -> None:
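A small sketch of what the renamed counters produce, using only the methods and message format shown in this hunk (the flat import is an assumption about the package layout; the duration value is whatever the clock says):

    from logging_config import IndexingLogger

    idx = IndexingLogger("code")
    idx.start("/repo")
    idx.file_indexed("a.py", items=3)          # DEBUG log, files_newly_indexed = 1
    idx.file_skipped("b.py", "unchanged")      # DEBUG log, files_unchanged = 1
    idx.complete()
    # INFO log: "Completed code indexing: files=1 items=3 unchanged=1 duration=...ms"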
@@ -11,6 +11,7 @@ from __future__ import annotations

 import logging
 import os
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 from typing import Any

@@ -21,6 +22,9 @@ import db as db_mod

 logger = logging.getLogger(__name__)

+# Number of worker threads for parallel indexing (configurable via env)
+MAX_WORKERS = int(os.environ.get("CODE_MEMORY_MAX_WORKERS", "4"))
+
 # ── Directories to always skip (even without .gitignore) ───────────────
 _SKIP_DIRS = frozenset({
     ".venv", "venv", "__pycache__", ".git", "node_modules",
@@ -452,7 +456,11 @@ def index_file(filepath: str, db) -> dict:
 # ---------------------------------------------------------------------------

 def index_directory(dirpath: str, db, progress_callback=None) -> list[dict]:
-    """Recursively index all source files under *dirpath*.
+    """Recursively index all source files under *dirpath* using parallel processing.
+
+    Uses ThreadPoolExecutor for parallel file I/O and parsing, while keeping
+    embedding generation sequential (sentence-transformers releases GIL during
+    inference). Processes files in batches for embedding efficiency.

     Skips directories in ``_SKIP_DIRS``, files matching ``.gitignore`` patterns
     (including nested .gitignore files), and unchanged files. Indexes any file
@@ -476,7 +484,7 @@ def index_directory(dirpath: str, db, progress_callback=None) -> list[dict]:
     gitignore = GitignoreMatcher(dirpath)
     logger.debug("Initialized gitignore matcher for %s", dirpath)

-    # First pass: count total files for progress reporting
+    # First pass: collect all files to index
     total_files = 0
     file_list = []
     for root, dirs, files in os.walk(dirpath, topdown=True):
@@ -494,69 +502,243 @@ def index_directory(dirpath: str, db, progress_callback=None) -> list[dict]:
             file_list.append(os.path.join(root, fname))
             total_files += 1

-    # Reset gitignore for actual indexing pass
-    gitignore = GitignoreMatcher(dirpath)
+    if not file_list:
+        return []

-    files_processed = 0
-    for root, dirs, files in os.walk(dirpath, topdown=True):
-        rel_root = os.path.relpath(root, dirpath)
-
-        # Check for .gitignore in current directory and load it
-        if rel_root != ".":
-            gitignore.check_dir_for_gitignore(root, rel_root)
+    # Report initial phase
+    if progress_callback:
+        progress_callback(0, total_files, "Scanning files for changes...")

-        # Prune skipped directories in-place (always-skip + gitignore)
-        def _should_keep_dir(d: str) -> bool:
-            if d in _SKIP_DIRS or d.endswith(".egg-info"):
-                return False
-            rel_path = os.path.join(rel_root, d) if rel_root != "." else d
-            if gitignore.should_skip(rel_path, is_dir=True):
-                return False
-            return True
+    # Phase 1: Parallel file freshness check and parsing
+    # Each worker returns parsed data (not yet stored to DB)
+    files_processed = 0
+    parsed_files: list[tuple[str, dict | None, Exception | None]] = []  # (filepath, parsed_data, error)

-        dirs[:] = [d for d in dirs if _should_keep_dir(d)]
+    def _parse_file_task(fpath: str) -> tuple[str, dict | None, Exception | None]:
+        """Parse a single file and return extracted data (without DB writes)."""
+        try:
+            parsed = _parse_file_for_indexing(fpath, db)
+            return (fpath, parsed, None)
+        except Exception as e:
+            return (fpath, None, e)

-        for fname in sorted(files):
-            # Skip files matching .gitignore patterns
-            rel_path = os.path.join(rel_root, fname) if rel_root != "." else fname
-            if gitignore.should_skip(rel_path, is_dir=False):
-                continue
+    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+        # Submit all parsing tasks
+        future_to_path = {executor.submit(_parse_file_task, fpath): fpath for fpath in file_list}

-            ext = os.path.splitext(fname)[1].lower()
-            # Accept files with known extensions, or files with a
-            # tree-sitter grammar available
-            if ext not in _SOURCE_EXTENSIONS and _load_language(ext) is None:
-                continue
+        for future in as_completed(future_to_path):
+            fpath, parsed_data, error = future.result()
+            parsed_files.append((fpath, parsed_data, error))

-            fpath = os.path.join(root, fname)
-            try:
-                result = index_file(fpath, db)
-                results.append(result)
-            except Exception:
-                logger.exception("Failed to index %s", fpath)
-                results.append({
-                    "file": fpath,
-                    "symbols_indexed": 0,
-                    "references_indexed": 0,
-                    "skipped": True,
-                    "error": True,
-                })
-
-            # Report progress
             files_processed += 1
             if progress_callback:
-                progress_callback(files_processed, total_files, f"Indexing code: {fname}")
+                fname = os.path.basename(fpath)
+                progress_callback(files_processed, total_files, f"Parsing: {fname}")
+
+    # Phase 2: Batch embedding generation (sequential, GIL released during inference)
+    if progress_callback:
+        progress_callback(total_files, total_files, "Generating embeddings...")
+
+    # Collect all texts that need embedding
+    embedding_batches: list[tuple[str, list[tuple]]] = []  # (filepath, [(embed_text, symbol_data), ...])
+
+    for fpath, parsed_data, error in parsed_files:
+        if error or parsed_data is None or parsed_data.get("skipped"):
+            continue
+
+        embed_inputs = []
+        for sym in parsed_data.get("symbols", []):
+            embed_input = f"{sym['kind']} {sym['name']}: {sym['source_text'][:1000]}"
+            embed_inputs.append((embed_input, sym))
+
+        if embed_inputs:
+            embedding_batches.append((fpath, embed_inputs, parsed_data))
+
+    # Generate embeddings in batch
+    all_embed_texts = []
+    for fpath, embed_inputs, _ in embedding_batches:
+        for embed_text, _ in embed_inputs:
+            all_embed_texts.append(embed_text)
+
+    all_embeddings = db_mod.embed_texts_batch(all_embed_texts, batch_size=64) if all_embed_texts else []
+
+    # Phase 3: Sequential DB writes (to avoid SQLite conflicts)
+    if progress_callback:
+        progress_callback(total_files, total_files, "Storing to database...")
+
+    embed_idx = 0
+    for fpath, parsed_data, error in parsed_files:
+        if error:
+            logger.exception("Failed to index %s", fpath)
+            results.append({
+                "file": fpath,
+                "symbols_indexed": 0,
+                "references_indexed": 0,
+                "skipped": True,
+                "error": True,
+            })
+            continue
+
+        if parsed_data is None or parsed_data.get("skipped"):
+            results.append({
+                "file": fpath,
+                "symbols_indexed": 0,
+                "references_indexed": 0,
+                "skipped": True,
+            })
+            continue
+
+        # Find embeddings for this file
+        file_result = _store_parsed_file(fpath, parsed_data, db, embedding_batches, all_embeddings, embed_idx)
+        embed_idx += len(parsed_data.get("symbols", []))
+        results.append(file_result)

     # Log performance summary
     total_elapsed = time.perf_counter() - total_start
     total_symbols = sum(r.get("symbols_indexed", 0) for r in results)
     total_refs = sum(r.get("references_indexed", 0) for r in results)
-    files_indexed = sum(1 for r in results if not r.get("skipped"))
-    files_skipped = sum(1 for r in results if r.get("skipped") and not r.get("error"))
-
-    logger.info(
-        "Indexed %d files (%d skipped) in %.2fs - %d symbols, %d references",
-        files_indexed, files_skipped, total_elapsed, total_symbols, total_refs
-    )
+    files_newly_indexed = sum(1 for r in results if not r.get("skipped"))
+    files_unchanged = sum(1 for r in results if r.get("skipped") and not r.get("error"))
+
+    if total_files > 0:
+        files_per_sec = total_files / total_elapsed if total_elapsed > 0 else 0
+        logger.info(
+            "Indexed %d files (%d unchanged) in %.2fs (%.1f files/s) - %d symbols, %d references",
+            files_newly_indexed, files_unchanged, total_elapsed, files_per_sec, total_symbols, total_refs
+        )
+    else:
+        logger.info(
+            "Indexed %d files (%d unchanged) in %.2fs - %d symbols, %d references",
+            files_newly_indexed, files_unchanged, total_elapsed, total_symbols, total_refs
+        )

     return results
+
+
+def _parse_file_for_indexing(filepath: str, db) -> dict | None:
+    """Parse a file and extract symbols/references without DB writes.
+
+    Returns parsed data structure or None if skipped.
+    """
+    filepath = os.path.abspath(filepath)
+    ext = os.path.splitext(filepath)[1].lower()
+
+    # Check freshness
+    mtime = os.path.getmtime(filepath)
+    row = db.execute(
+        "SELECT id, last_modified FROM files WHERE path = ?", (filepath,)
+    ).fetchone()
+
+    if row and row[1] >= mtime:
+        return {"skipped": True, "file_id": row[0]}
+
+    # Read file
+    source_bytes = Path(filepath).read_bytes()
+    source_text = source_bytes.decode("utf-8", errors="replace")
+
+    fhash = db_mod.file_hash(filepath)
+
+    result = {
+        "skipped": False,
+        "mtime": mtime,
+        "fhash": fhash,
+        "symbols": [],
+        "references": [],
+        "fallback": False,
+    }
+
+    # Try tree-sitter parsing
+    lang = _load_language(ext)
+
+    if lang is not None:
+        parser = Parser(lang)
+        tree = parser.parse(source_bytes)
+
+        # Extract symbols (flat list for batch processing)
+        raw_symbols = _extract_symbols(tree.root_node, source_bytes)
+        all_symbols: list[dict] = []
+
+        def _collect_symbols(sym_list):
+            for sym in sym_list:
+                all_symbols.append(sym)
+                if sym.get("children"):
+                    _collect_symbols(sym["children"])
+
+        _collect_symbols(raw_symbols)
+        result["symbols"] = all_symbols
+
+        # Extract references
+        refs = _extract_references(tree.root_node, source_bytes)
+        result["references"] = refs
+    else:
+        # Fallback: entire file as one symbol
+        basename = os.path.basename(filepath)
+        result["symbols"] = [{
+            "name": basename,
+            "kind": "file",
+            "line_start": 1,
+            "line_end": source_text.count("\n") + 1,
+            "source_text": source_text[:5000],
+            "parent_id": None,
+        }]
+        result["fallback"] = True
+
+    return result
+
+
+def _store_parsed_file(
+    filepath: str,
+    parsed_data: dict,
+    db,
+    embedding_batches: list,
+    all_embeddings: list,
+    start_embed_idx: int
+) -> dict:
+    """Store parsed file data to database with pre-computed embeddings."""
+    filepath = os.path.abspath(filepath)
+
+    # Upsert file record
+    file_id = db_mod.upsert_file(db, filepath, parsed_data["mtime"], parsed_data["fhash"])
+
+    # Delete stale data
+    db_mod.delete_file_data(db, file_id)
+
+    symbols_indexed = 0
+    references_indexed = 0
+
+    # Find embeddings for this file
+    file_embeddings = None
+    embed_offset = 0
+    for bfpath, embed_inputs, _ in embedding_batches:
+        if bfpath == filepath:
+            file_embeddings = all_embeddings[start_embed_idx + embed_offset:start_embed_idx + embed_offset + len(embed_inputs)]
+            break
+        embed_offset += len(embed_inputs)
+
+    # Store symbols with embeddings
+    if parsed_data.get("symbols") and file_embeddings:
+        with db_mod.transaction(db):
+            for i, sym in enumerate(parsed_data["symbols"]):
+                sym_id = db_mod.upsert_symbol(
+                    db, sym["name"], sym["kind"], file_id,
+                    sym["line_start"], sym["line_end"],
+                    sym.get("parent_id"), sym["source_text"],
+                    auto_commit=False
+                )
+                if i < len(file_embeddings):
+                    db_mod.upsert_embedding(db, sym_id, file_embeddings[i], auto_commit=False)
+                symbols_indexed += 1
+
+    # Store references
+    if parsed_data.get("references"):
+        with db_mod.transaction(db):
+            for ref in parsed_data["references"]:
+                db_mod.upsert_reference(db, ref["name"], file_id, ref["line"], auto_commit=False)
+                references_indexed += 1
+
+    return {
+        "file": filepath,
+        "symbols_indexed": symbols_indexed,
+        "references_indexed": references_indexed,
+        "skipped": False,
+    }
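The rewritten index_directory above boils down to three phases. A standalone sketch of the same pattern, with hypothetical parse/embed_batch/store callables (names not from the package) and a default of four workers mirroring CODE_MEMORY_MAX_WORKERS, may help as a mental model:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def index_many(paths, parse, embed_batch, store, max_workers=4):
        # Phase 1: parse files in a thread pool; nothing touches the database yet
        parsed = {}
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = {pool.submit(parse, p): p for p in paths}
            for fut in as_completed(futures):
                parsed[futures[fut]] = fut.result()

        # Phase 2: one batched embedding call over every extracted text
        texts = [t for data in parsed.values() for t in data["texts"]]
        vectors = iter(embed_batch(texts))

        # Phase 3: sequential writes, so a single SQLite connection is never shared
        for path, data in parsed.items():
            store(path, data, [next(vectors) for _ in data["texts"]])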
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

 [project]
 name = "code-memory"
-version = "1.0.17"
+version = "1.0.18"
 description = "A deterministic, high-precision code intelligence MCP server"
 readme = "README.md"
 license = "MIT"
@@ -108,7 +108,8 @@ def hybrid_search(query: str, db, top_k: int = 10) -> list[dict]:
         top_k: Number of results to return.

     Returns:
-        A list of result dicts sorted by descending RRF score.
+        A list of result dicts sorted by descending RRF score, including
+        match_reason, match_highlights, and confidence.
     """
     bm25_results = _bm25_search(query, db, top_k=50)
     vec_results = _vector_search(query, db, top_k=50)
@@ -116,6 +117,7 @@ def hybrid_search(query: str, db, top_k: int = 10) -> list[dict]:
     # Build RRF score map keyed by symbol_id
     scores: dict[int, float] = {}
     details: dict[int, dict] = {}
+    match_sources: dict[int, list[str]] = {}  # Track which search found each result

     for rank, r in enumerate(bm25_results, start=1):
         sid = r["symbol_id"]
@@ -128,6 +130,8 @@ def hybrid_search(query: str, db, top_k: int = 10) -> list[dict]:
             "line_end": r["line_end"],
             "source_text": r["source_text"],
         }
+        match_sources[sid] = match_sources.get(sid, [])
+        match_sources[sid].append("bm25")

     for rank, r in enumerate(vec_results, start=1):
         sid = r["symbol_id"]
@@ -141,14 +145,112 @@ def hybrid_search(query: str, db, top_k: int = 10) -> list[dict]:
             "line_end": r["line_end"],
             "source_text": r["source_text"],
         }
+        match_sources[sid] = match_sources.get(sid, [])
+        if "vector" not in match_sources[sid]:
+            match_sources[sid].append("vector")

     # Sort by descending RRF score
     ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:top_k]

-    return [
-        {**details[sid], "score": round(score, 6)}
-        for sid, score in ranked
-    ]
+    # Build results with match metadata
+    results = []
+    for sid, score in ranked:
+        sources = match_sources.get(sid, [])
+        is_hybrid = len(sources) == 2
+
+        # Determine match reason
+        if is_hybrid:
+            match_reason = "hybrid (BM25 + semantic)"
+        elif "bm25" in sources:
+            match_reason = "keyword match (BM25)"
+        else:
+            match_reason = "semantic match (vector)"
+
+        # Calculate confidence (normalize RRF score to 0-1 range)
+        # Max possible RRF score for a single source is 1/61 ≈ 0.0164
+        # For hybrid it's 2/61 ≈ 0.0328. We normalize accordingly.
+        max_single_rrf = 1.0 / (_RRF_K + 1)  # ≈ 0.0164
+        max_hybrid_rrf = 2.0 * max_single_rrf  # ≈ 0.0328
+        if is_hybrid:
+            confidence = min(1.0, score / max_hybrid_rrf)
+        else:
+            confidence = min(1.0, (score / max_single_rrf) * 0.7)  # Cap single-source at 0.7
+
+        result = {
+            **details[sid],
+            "score": round(score, 6),
+            "match_reason": match_reason,
+            "confidence": round(confidence, 3),
+            "match_highlights": [],  # Will be populated below if BM25 match
+        }
+
+        # Get highlights for BM25 matches using FTS5 highlight function
+        if "bm25" in sources:
+            highlights = _get_bm25_highlights(query, details[sid]["source_text"], db)
+            result["match_highlights"] = highlights
+
+        results.append(result)
+
+    return results
+
+
+def _get_bm25_highlights(query: str, source_text: str, db) -> list[str]:
+    """Extract highlighted snippets using FTS5.
+
+    Returns up to 3 highlighted text snippets showing where the query matched.
+    """
+    if not source_text or not query:
+        return []
+
+    # Use FTS5 highlight function to get matched portions
+    safe_query = query.replace('"', '""')
+    try:
+        # Create a temporary FTS5 query to get highlights
+        # We use the snippet function which returns highlighted fragments
+        rows = db.execute(
+            """
+            SELECT snippet(symbols_fts, 1, '>>>', '<<<', '...', 20) as highlight
+            FROM symbols_fts
+            WHERE symbols_fts MATCH ?
+            LIMIT 3
+            """,
+            (safe_query,),
+        ).fetchall()
+
+        highlights = []
+        for row in rows:
+            if row[0] and row[0] not in ("...", ""):
+                # Clean up the highlight markers for readability
+                highlight = row[0].replace(">>>", "**").replace("<<<", "**")
+                if len(highlight) > 10:  # Only include meaningful highlights
+                    highlights.append(highlight)
+
+        return highlights[:3]  # Return at most 3 highlights
+    except Exception:
+        # Fallback: find query terms in source text
+        return _simple_highlights(query, source_text)
+
+
+def _simple_highlights(query: str, source_text: str) -> list[str]:
+    """Simple fallback highlight extraction when FTS5 isn't available."""
+    highlights = []
+    query_terms = query.lower().split()
+    lines = source_text.split("\n")
+
+    for line in lines[:20]:  # Check first 20 lines
+        line_lower = line.lower()
+        for term in query_terms:
+            if term in line_lower and len(line.strip()) > 10:
+                # Truncate long lines
+                snippet = line.strip()[:100]
+                if len(snippet) > 50:
+                    snippet = snippet[:97] + "..."
+                highlights.append(snippet)
+                break
+        if len(highlights) >= 3:
+            break
+
+    return highlights[:3]


 # ---------------------------------------------------------------------------
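A worked check of the confidence normalisation above. It assumes _RRF_K = 60, which the 1/61 ≈ 0.0164 in the hunk's comments implies; the ranks used are made up for illustration:

    _RRF_K = 60

    # A symbol ranked 1st by BM25 and 3rd by the vector search:
    score = 1 / (_RRF_K + 1) + 1 / (_RRF_K + 3)        # ≈ 0.01639 + 0.01587 = 0.03227
    max_hybrid_rrf = 2 / (_RRF_K + 1)                  # ≈ 0.03279
    print(round(min(1.0, score / max_hybrid_rrf), 3))  # 0.984

    # The same symbol found only by BM25 at rank 1 is capped at 0.7:
    single = 1 / (_RRF_K + 1)
    print(min(1.0, (single / (1 / (_RRF_K + 1))) * 0.7))  # 0.7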
@@ -128,6 +128,45 @@ def check_index_status(directory: str) -> dict:
     }


+# ── Tool 0.5: get_index_stats ─────────────────────────────────────────────
+@mcp.tool()
+def get_index_stats(directory: str) -> dict:
+    """USE THIS TOOL to get comprehensive statistics about the code index.
+
+    This tool provides detailed metrics about the index health, including
+    file counts, symbol distributions, embedding model info, and database size.
+
+    TRIGGER - Call this tool when:
+    - You want to understand what's in the index
+    - Debugging search quality issues
+    - Checking index freshness or coverage
+    - Monitoring database size and health
+
+    Do NOT use this tool for:
+    - Checking if indexing is needed (use check_index_status)
+    - Searching for code (use search_code)
+
+    Args:
+        directory: Path to the project directory.
+
+    Returns:
+        Dictionary with:
+        - indexed: boolean - true if anything has been indexed
+        - counts: Symbol, file, chunk, and embedding counts
+        - distributions: Symbol kinds and file extensions
+        - freshness: Last indexed timestamps
+        - embedding: Model name and dimension
+        - database: Size, journal mode, and WAL status
+    """
+    with logging_config.ToolLogger("get_index_stats", directory=directory):
+        try:
+            database = db_mod.get_db(directory)
+            stats = db_mod.get_index_stats(database, directory)
+            return {"status": "ok", **stats}
+        except Exception as e:
+            return errors.format_error(e)
+
+
 # ── Tool 1: search_code ───────────────────────────────────────────────────
 @mcp.tool()
 def search_code(
@@ -294,6 +333,7 @@ async def index_codebase(directory: str, ctx: Context) -> dict:
     - Enables semantic search via vector embeddings
    - Builds cross-reference graphs for "find all usages" queries
     - Incremental indexing: unchanged files are automatically skipped
+    - PARALLEL PROCESSING: Uses thread pool for faster indexing

     Do NOT use this tool for:
     - Non-code files (images, binaries, data files)
@@ -306,6 +346,8 @@ async def index_codebase(directory: str, ctx: Context) -> dict:
     Returns:
         Summary with files_indexed, total_symbols, total_chunks, and details.
     """
+    import time
+
     with logging_config.ToolLogger("index_codebase", directory=directory) as log:
         try:
             # Validate directory
@@ -313,20 +355,38 @@ async def index_codebase(directory: str, ctx: Context) -> dict:

             database = db_mod.get_db(str(directory_path))

+            # Track timing for throughput calculation
+            start_time = time.perf_counter()
+
             # Report initial progress
             await ctx.report_progress(0, 100, "Starting indexing...")

             # Create progress callback that schedules progress updates on the event loop
             loop = asyncio.get_running_loop()
-            progress_state = {"current": 0, "total": 0, "phase": "code"}
+            progress_state = {"current": 0, "total": 0, "phase": "scanning"}

             def sync_progress_callback(current: int, total: int, message: str):
-                """Sync callback that schedules async progress reporting."""
+                """Sync callback that schedules async progress reporting with throughput info."""
                 progress_state["current"] = current
                 progress_state["total"] = total
+
+                # Calculate throughput and ETA
+                elapsed = time.perf_counter() - start_time
+                if elapsed > 0 and current > 0:
+                    files_per_sec = current / elapsed
+                    if files_per_sec > 0 and total > current:
+                        remaining_files = total - current
+                        eta_seconds = remaining_files / files_per_sec
+                        eta_str = f", ETA: {int(eta_seconds)}s" if eta_seconds < 60 else f", ETA: {int(eta_seconds / 60)}m"
+                    else:
+                        eta_str = ""
+                    throughput_str = f" ({files_per_sec:.1f} files/s{eta_str})"
+                else:
+                    throughput_str = ""
+
                 # Schedule the async progress report on the event loop
                 asyncio.run_coroutine_threadsafe(
-                    ctx.report_progress(current, total, message),
+                    ctx.report_progress(current, total, f"{message}{throughput_str}"),
                     loop
                 )

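The ETA string in the callback above follows directly from the running throughput; a worked example with made-up numbers:

    # After 12.5 s the callback has seen current = 250 of total = 1000 files:
    elapsed = 12.5
    current, total = 250, 1000
    files_per_sec = current / elapsed                 # 20.0 files/s
    eta_seconds = (total - current) / files_per_sec   # 750 / 20 = 37.5 s
    # eta_seconds < 60, so the progress message ends with "(20.0 files/s, ETA: 37s)"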
@@ -334,7 +394,7 @@ async def index_codebase(directory: str, ctx: Context) -> dict:
             code_logger = logging_config.IndexingLogger("code")
             code_logger.start(str(directory_path))

-            await ctx.report_progress(0, 100, "Scanning code files...")
+            await ctx.report_progress(0, 100, "Phase 1/3: Scanning code files...")

             code_results = await asyncio.to_thread(
                 parser_mod.index_directory,
@@ -361,7 +421,7 @@ async def index_codebase(directory: str, ctx: Context) -> dict:
             code_file_count = len(code_results)
             doc_progress_offset = code_file_count

-            await ctx.report_progress(code_file_count, code_file_count, "Scanning documentation files...")
+            await ctx.report_progress(code_file_count, code_file_count, "Phase 2/3: Scanning documentation files...")

             doc_results = await asyncio.to_thread(
                 doc_parser_mod.index_doc_directory,
@@ -383,7 +443,7 @@ async def index_codebase(directory: str, ctx: Context) -> dict:
             doc_skipped = [r for r in doc_results if r.get("skipped")]

             # Extract docstrings from indexed code
-            await ctx.report_progress(0, 0, "Extracting docstrings...")
+            await ctx.report_progress(0, 0, "Phase 3/3: Extracting docstrings...")
             docstring_results = await asyncio.to_thread(
                 doc_parser_mod.extract_docstrings_from_code,
                 database
@@ -393,20 +453,36 @@ async def index_codebase(directory: str, ctx: Context) -> dict:
             total_chunks = sum(r.get("chunks_indexed", 0) for r in doc_indexed)
             log.set_result_count(total_symbols + total_chunks + len(docstring_results))

-            await ctx.report_progress(100, 100, "Indexing complete!")
+            # Calculate final throughput
+            total_elapsed = time.perf_counter() - start_time
+            total_files = len(code_results) + len(doc_results)
+            files_per_sec = total_files / total_elapsed if total_elapsed > 0 else 0
+
+            await ctx.report_progress(100, 100, f"Indexing complete! ({files_per_sec:.1f} files/s)")
+
+            # Get total indexed counts from database for cumulative stats
+            total_code_files = database.execute("SELECT COUNT(*) FROM files").fetchone()[0]
+            total_doc_files = database.execute("SELECT COUNT(*) FROM doc_files").fetchone()[0]

             return {
                 "status": "ok",
                 "directory": str(directory_path),
+                "performance": {
+                    "total_time_seconds": round(total_elapsed, 2),
+                    "files_per_second": round(files_per_sec, 1),
+                    "total_files_processed": total_files,
+                },
                 "code": {
-                    "files_indexed": len(indexed),
-                    "files_skipped": len(skipped),
+                    "files_newly_indexed": len(indexed),
+                    "files_unchanged": len(skipped),
+                    "total_indexed_files": total_code_files,
                     "total_symbols": total_symbols,
                     "total_references": sum(r.get("references_indexed", 0) for r in indexed),
                 },
                 "documentation": {
-                    "files_indexed": len(doc_indexed),
-                    "files_skipped": len(doc_skipped),
+                    "files_newly_indexed": len(doc_indexed),
+                    "files_unchanged": len(doc_skipped),
+                    "total_indexed_files": total_doc_files,
                     "total_chunks": total_chunks,
                     "docstrings_extracted": len(docstring_results),
                 },
@@ -133,19 +133,19 @@ class TestToolLogger:
 class TestIndexingLogger:
     """Tests for IndexingLogger class."""

-    def test_tracks_files_processed(self):
-        """Test that files processed are tracked."""
+    def test_tracks_files_newly_indexed(self):
+        """Test that files newly indexed are tracked."""
         idx_logger = logging_config.IndexingLogger("test")
         idx_logger.file_indexed("file1.py", 3)
         idx_logger.file_indexed("file2.py", 2)
-        assert idx_logger.files_processed == 2
+        assert idx_logger.files_newly_indexed == 2
         assert idx_logger.items_indexed == 5

-    def test_tracks_files_skipped(self):
-        """Test that files skipped are tracked."""
+    def test_tracks_files_unchanged(self):
+        """Test that files unchanged are tracked."""
         idx_logger = logging_config.IndexingLogger("test")
         idx_logger.file_skipped("file1.py", "unchanged")
-        assert idx_logger.files_skipped == 1
+        assert idx_logger.files_unchanged == 1


 class TestPreconfiguredLoggers: