PyPI - codexa - Versions diffs - 0.4.0__py3-none-any.whl - Mend

codexa 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (189) hide show

codexa-0.4.0.dist-info/METADATA +650 -0
codexa-0.4.0.dist-info/RECORD +189 -0
codexa-0.4.0.dist-info/WHEEL +5 -0
codexa-0.4.0.dist-info/entry_points.txt +2 -0
codexa-0.4.0.dist-info/licenses/LICENSE +21 -0
codexa-0.4.0.dist-info/top_level.txt +1 -0
semantic_code_intelligence/__init__.py +5 -0
semantic_code_intelligence/analysis/__init__.py +21 -0
semantic_code_intelligence/analysis/ai_features.py +351 -0
semantic_code_intelligence/bridge/__init__.py +28 -0
semantic_code_intelligence/bridge/context_provider.py +245 -0
semantic_code_intelligence/bridge/protocol.py +167 -0
semantic_code_intelligence/bridge/server.py +348 -0
semantic_code_intelligence/bridge/vscode.py +271 -0
semantic_code_intelligence/ci/__init__.py +13 -0
semantic_code_intelligence/ci/hooks.py +98 -0
semantic_code_intelligence/ci/hotspots.py +272 -0
semantic_code_intelligence/ci/impact.py +246 -0
semantic_code_intelligence/ci/metrics.py +591 -0
semantic_code_intelligence/ci/pr.py +412 -0
semantic_code_intelligence/ci/quality.py +557 -0
semantic_code_intelligence/ci/templates.py +164 -0
semantic_code_intelligence/ci/trace.py +224 -0
semantic_code_intelligence/cli/__init__.py +0 -0
semantic_code_intelligence/cli/commands/__init__.py +0 -0
semantic_code_intelligence/cli/commands/ask_cmd.py +153 -0
semantic_code_intelligence/cli/commands/benchmark_cmd.py +303 -0
semantic_code_intelligence/cli/commands/chat_cmd.py +252 -0
semantic_code_intelligence/cli/commands/ci_gen_cmd.py +74 -0
semantic_code_intelligence/cli/commands/context_cmd.py +120 -0
semantic_code_intelligence/cli/commands/cross_refactor_cmd.py +113 -0
semantic_code_intelligence/cli/commands/deps_cmd.py +91 -0
semantic_code_intelligence/cli/commands/docs_cmd.py +101 -0
semantic_code_intelligence/cli/commands/doctor_cmd.py +147 -0
semantic_code_intelligence/cli/commands/evolve_cmd.py +171 -0
semantic_code_intelligence/cli/commands/explain_cmd.py +112 -0
semantic_code_intelligence/cli/commands/gate_cmd.py +135 -0
semantic_code_intelligence/cli/commands/grep_cmd.py +234 -0
semantic_code_intelligence/cli/commands/hotspots_cmd.py +119 -0
semantic_code_intelligence/cli/commands/impact_cmd.py +131 -0
semantic_code_intelligence/cli/commands/index_cmd.py +138 -0
semantic_code_intelligence/cli/commands/init_cmd.py +152 -0
semantic_code_intelligence/cli/commands/investigate_cmd.py +163 -0
semantic_code_intelligence/cli/commands/languages_cmd.py +101 -0
semantic_code_intelligence/cli/commands/lsp_cmd.py +49 -0
semantic_code_intelligence/cli/commands/mcp_cmd.py +50 -0
semantic_code_intelligence/cli/commands/metrics_cmd.py +264 -0
semantic_code_intelligence/cli/commands/models_cmd.py +157 -0
semantic_code_intelligence/cli/commands/plugin_cmd.py +275 -0
semantic_code_intelligence/cli/commands/pr_summary_cmd.py +178 -0
semantic_code_intelligence/cli/commands/quality_cmd.py +208 -0
semantic_code_intelligence/cli/commands/refactor_cmd.py +103 -0
semantic_code_intelligence/cli/commands/review_cmd.py +88 -0
semantic_code_intelligence/cli/commands/search_cmd.py +236 -0
semantic_code_intelligence/cli/commands/serve_cmd.py +117 -0
semantic_code_intelligence/cli/commands/suggest_cmd.py +100 -0
semantic_code_intelligence/cli/commands/summary_cmd.py +78 -0
semantic_code_intelligence/cli/commands/tool_cmd.py +282 -0
semantic_code_intelligence/cli/commands/trace_cmd.py +123 -0
semantic_code_intelligence/cli/commands/tui_cmd.py +58 -0
semantic_code_intelligence/cli/commands/viz_cmd.py +127 -0
semantic_code_intelligence/cli/commands/watch_cmd.py +72 -0
semantic_code_intelligence/cli/commands/web_cmd.py +61 -0
semantic_code_intelligence/cli/commands/workspace_cmd.py +250 -0
semantic_code_intelligence/cli/main.py +65 -0
semantic_code_intelligence/cli/router.py +92 -0
semantic_code_intelligence/config/__init__.py +0 -0
semantic_code_intelligence/config/settings.py +260 -0
semantic_code_intelligence/context/__init__.py +19 -0
semantic_code_intelligence/context/engine.py +429 -0
semantic_code_intelligence/context/memory.py +253 -0
semantic_code_intelligence/daemon/__init__.py +1 -0
semantic_code_intelligence/daemon/watcher.py +515 -0
semantic_code_intelligence/docs/__init__.py +1080 -0
semantic_code_intelligence/embeddings/__init__.py +0 -0
semantic_code_intelligence/embeddings/enhanced.py +131 -0
semantic_code_intelligence/embeddings/generator.py +149 -0
semantic_code_intelligence/embeddings/model_registry.py +100 -0
semantic_code_intelligence/evolution/__init__.py +1 -0
semantic_code_intelligence/evolution/budget_guard.py +111 -0
semantic_code_intelligence/evolution/commit_manager.py +88 -0
semantic_code_intelligence/evolution/context_builder.py +131 -0
semantic_code_intelligence/evolution/engine.py +249 -0
semantic_code_intelligence/evolution/patch_generator.py +229 -0
semantic_code_intelligence/evolution/task_selector.py +214 -0
semantic_code_intelligence/evolution/test_runner.py +111 -0
semantic_code_intelligence/indexing/__init__.py +0 -0
semantic_code_intelligence/indexing/chunker.py +174 -0
semantic_code_intelligence/indexing/parallel.py +86 -0
semantic_code_intelligence/indexing/scanner.py +146 -0
semantic_code_intelligence/indexing/semantic_chunker.py +337 -0
semantic_code_intelligence/llm/__init__.py +62 -0
semantic_code_intelligence/llm/cache.py +219 -0
semantic_code_intelligence/llm/cached_provider.py +145 -0
semantic_code_intelligence/llm/conversation.py +190 -0
semantic_code_intelligence/llm/cross_refactor.py +272 -0
semantic_code_intelligence/llm/investigation.py +274 -0
semantic_code_intelligence/llm/mock_provider.py +77 -0
semantic_code_intelligence/llm/ollama_provider.py +122 -0
semantic_code_intelligence/llm/openai_provider.py +100 -0
semantic_code_intelligence/llm/provider.py +92 -0
semantic_code_intelligence/llm/rate_limiter.py +164 -0
semantic_code_intelligence/llm/reasoning.py +438 -0
semantic_code_intelligence/llm/safety.py +110 -0
semantic_code_intelligence/llm/streaming.py +251 -0
semantic_code_intelligence/lsp/__init__.py +609 -0
semantic_code_intelligence/mcp/__init__.py +393 -0
semantic_code_intelligence/parsing/__init__.py +19 -0
semantic_code_intelligence/parsing/parser.py +375 -0
semantic_code_intelligence/plugins/__init__.py +255 -0
semantic_code_intelligence/plugins/examples/__init__.py +1 -0
semantic_code_intelligence/plugins/examples/code_quality.py +73 -0
semantic_code_intelligence/plugins/examples/search_annotator.py +56 -0
semantic_code_intelligence/scalability/__init__.py +205 -0
semantic_code_intelligence/search/__init__.py +0 -0
semantic_code_intelligence/search/formatter.py +123 -0
semantic_code_intelligence/search/grep.py +361 -0
semantic_code_intelligence/search/hybrid_search.py +170 -0
semantic_code_intelligence/search/keyword_search.py +311 -0
semantic_code_intelligence/search/section_expander.py +103 -0
semantic_code_intelligence/services/__init__.py +0 -0
semantic_code_intelligence/services/indexing_service.py +630 -0
semantic_code_intelligence/services/search_service.py +269 -0
semantic_code_intelligence/storage/__init__.py +0 -0
semantic_code_intelligence/storage/chunk_hash_store.py +86 -0
semantic_code_intelligence/storage/hash_store.py +66 -0
semantic_code_intelligence/storage/index_manifest.py +85 -0
semantic_code_intelligence/storage/index_stats.py +138 -0
semantic_code_intelligence/storage/query_history.py +160 -0
semantic_code_intelligence/storage/symbol_registry.py +209 -0
semantic_code_intelligence/storage/vector_store.py +297 -0
semantic_code_intelligence/tests/__init__.py +0 -0
semantic_code_intelligence/tests/test_ai_features.py +351 -0
semantic_code_intelligence/tests/test_chunker.py +119 -0
semantic_code_intelligence/tests/test_cli.py +188 -0
semantic_code_intelligence/tests/test_config.py +154 -0
semantic_code_intelligence/tests/test_context.py +381 -0
semantic_code_intelligence/tests/test_embeddings.py +73 -0
semantic_code_intelligence/tests/test_endtoend.py +1142 -0
semantic_code_intelligence/tests/test_enhanced_embeddings.py +92 -0
semantic_code_intelligence/tests/test_hash_store.py +79 -0
semantic_code_intelligence/tests/test_logging.py +55 -0
semantic_code_intelligence/tests/test_new_cli.py +138 -0
semantic_code_intelligence/tests/test_parser.py +495 -0
semantic_code_intelligence/tests/test_phase10.py +355 -0
semantic_code_intelligence/tests/test_phase11.py +593 -0
semantic_code_intelligence/tests/test_phase12.py +375 -0
semantic_code_intelligence/tests/test_phase13.py +663 -0
semantic_code_intelligence/tests/test_phase14.py +568 -0
semantic_code_intelligence/tests/test_phase15.py +814 -0
semantic_code_intelligence/tests/test_phase16.py +792 -0
semantic_code_intelligence/tests/test_phase17.py +815 -0
semantic_code_intelligence/tests/test_phase18.py +934 -0
semantic_code_intelligence/tests/test_phase19.py +986 -0
semantic_code_intelligence/tests/test_phase20.py +2753 -0
semantic_code_intelligence/tests/test_phase20b.py +2058 -0
semantic_code_intelligence/tests/test_phase20c.py +962 -0
semantic_code_intelligence/tests/test_phase21.py +428 -0
semantic_code_intelligence/tests/test_phase22.py +799 -0
semantic_code_intelligence/tests/test_phase23.py +783 -0
semantic_code_intelligence/tests/test_phase24.py +715 -0
semantic_code_intelligence/tests/test_phase25.py +496 -0
semantic_code_intelligence/tests/test_phase26.py +251 -0
semantic_code_intelligence/tests/test_phase27.py +531 -0
semantic_code_intelligence/tests/test_phase8.py +592 -0
semantic_code_intelligence/tests/test_phase9.py +643 -0
semantic_code_intelligence/tests/test_plugins.py +293 -0
semantic_code_intelligence/tests/test_priority_features.py +727 -0
semantic_code_intelligence/tests/test_router.py +41 -0
semantic_code_intelligence/tests/test_scalability.py +138 -0
semantic_code_intelligence/tests/test_scanner.py +125 -0
semantic_code_intelligence/tests/test_search.py +160 -0
semantic_code_intelligence/tests/test_semantic_chunker.py +255 -0
semantic_code_intelligence/tests/test_tools.py +182 -0
semantic_code_intelligence/tests/test_vector_store.py +151 -0
semantic_code_intelligence/tests/test_watcher.py +211 -0
semantic_code_intelligence/tools/__init__.py +442 -0
semantic_code_intelligence/tools/executor.py +232 -0
semantic_code_intelligence/tools/protocol.py +200 -0
semantic_code_intelligence/tui/__init__.py +454 -0
semantic_code_intelligence/utils/__init__.py +0 -0
semantic_code_intelligence/utils/logging.py +112 -0
semantic_code_intelligence/version.py +3 -0
semantic_code_intelligence/web/__init__.py +11 -0
semantic_code_intelligence/web/api.py +289 -0
semantic_code_intelligence/web/server.py +397 -0
semantic_code_intelligence/web/ui.py +659 -0
semantic_code_intelligence/web/visualize.py +226 -0
semantic_code_intelligence/workspace/__init__.py +427 -0

semantic_code_intelligence/storage/query_history.py ADDED Viewed

@@ -0,0 +1,160 @@
+"""Query history — cross-session intelligence and search analytics.
+Records past search queries, their result counts and scores, enabling
+popular-symbol tracking, query suggestions, and analytics on what
+developers search for most.
+"""
+from __future__ import annotations
+import json
+import time
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Any
+# Name of the JSON file (inside the directory passed to save()/load()) that holds the history.
+HISTORY_FILE = "query_history.json"
+# Default cap on retained queries; QueryHistory evicts the oldest entries beyond it.
+MAX_HISTORY = 500
+@dataclass
+class QueryRecord:
+    """One search query together with summary information about its results."""
+    query: str
+    timestamp: float = 0.0
+    result_count: int = 0
+    top_score: float = 0.0
+    languages: list[str] = field(default_factory=list)
+    top_files: list[str] = field(default_factory=list)
+    def to_dict(self) -> dict[str, Any]:
+        """Return the record as a plain dictionary keyed by field name."""
+        as_mapping: dict[str, Any] = asdict(self)
+        return as_mapping
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> QueryRecord:
+        """Build a record from *data*, ignoring keys that are not fields."""
+        # The keys of ``__dataclass_fields__`` are exactly the field names.
+        field_names = set(cls.__dataclass_fields__)  # type: ignore[attr-defined]
+        kwargs: dict[str, Any] = {}
+        for key, value in data.items():
+            if key in field_names:
+                kwargs[key] = value
+        return cls(**kwargs)
+class QueryHistory:
+    """Persistent query history with analytics.
+    Stores the last *max_entries* queries and provides aggregate
+    statistics for popular searches and result files.
+    """
+    def __init__(self, max_entries: int = MAX_HISTORY) -> None:
+        """Create an empty history that retains at most *max_entries* records."""
+        self._records: list[QueryRecord] = []
+        # A negative cap can never be satisfied; treat it as "keep nothing"
+        # instead of letting eviction run past an empty list.
+        self._max_entries = max(0, max_entries)
+    # ------------------------------------------------------------------
+    # Mutation
+    # ------------------------------------------------------------------
+    def record(
+        self,
+        query: str,
+        result_count: int = 0,
+        top_score: float = 0.0,
+        languages: list[str] | None = None,
+        top_files: list[str] | None = None,
+    ) -> QueryRecord:
+        """Record a search query, timestamped with the current time.
+        Returns the newly created record.  When the history exceeds its cap
+        the oldest records are evicted.
+        """
+        entry = QueryRecord(
+            query=query,
+            timestamp=time.time(),
+            result_count=result_count,
+            top_score=top_score,
+            languages=languages or [],
+            top_files=top_files or [],
+        )
+        self._records.append(entry)
+        self._trim()
+        return entry
+    def clear(self) -> None:
+        """Remove all history."""
+        self._records.clear()
+    # ------------------------------------------------------------------
+    # Queries
+    # ------------------------------------------------------------------
+    @property
+    def size(self) -> int:
+        """Number of records currently held."""
+        return len(self._records)
+    @property
+    def records(self) -> list[QueryRecord]:
+        """Return a copy of all records (newest last)."""
+        return list(self._records)
+    def recent(self, n: int = 10) -> list[QueryRecord]:
+        """Return the *n* most recent queries (oldest of them first).
+        A non-positive *n* yields an empty list.
+        """
+        if n <= 0:
+            # ``self._records[-0:]`` is the *whole* list, not an empty one.
+            return []
+        return list(self._records[-n:])
+    def popular_queries(self, n: int = 10) -> list[tuple[str, int]]:
+        """Return the *n* most frequent query strings with counts."""
+        counts: dict[str, int] = {}
+        for r in self._records:
+            counts[r.query] = counts.get(r.query, 0) + 1
+        return self._top_n(counts, n)
+    def popular_files(self, n: int = 10) -> list[tuple[str, int]]:
+        """Return the *n* most frequently appearing files in results."""
+        counts: dict[str, int] = {}
+        for r in self._records:
+            for f in r.top_files:
+                counts[f] = counts.get(f, 0) + 1
+        return self._top_n(counts, n)
+    def avg_result_count(self) -> float:
+        """Return the average number of results per query (0.0 when empty)."""
+        if not self._records:
+            return 0.0
+        return sum(r.result_count for r in self._records) / len(self._records)
+    def to_dict(self) -> dict[str, Any]:
+        """Return the cap and all records as JSON-serialisable data."""
+        return {
+            "max_entries": self._max_entries,
+            "records": [r.to_dict() for r in self._records],
+        }
+    def __repr__(self) -> str:
+        return f"QueryHistory(records={len(self._records)})"
+    # ------------------------------------------------------------------
+    # Persistence
+    # ------------------------------------------------------------------
+    def save(self, directory: str | Path) -> None:
+        """Write history to ``HISTORY_FILE`` inside *directory* (created if missing)."""
+        path = Path(directory)
+        path.mkdir(parents=True, exist_ok=True)
+        (path / HISTORY_FILE).write_text(
+            json.dumps(self.to_dict(), indent=2, ensure_ascii=False),
+            encoding="utf-8",
+        )
+    @classmethod
+    def load(cls, directory: str | Path) -> QueryHistory:
+        """Load history from disk.  Returns empty history if absent.
+        Loading is best-effort: an unreadable or malformed file yields an
+        empty history, and individual malformed records are skipped.
+        """
+        history = cls()
+        path = Path(directory) / HISTORY_FILE
+        if not path.exists():
+            return history
+        try:
+            data = json.loads(path.read_text(encoding="utf-8"))
+        except (ValueError, OSError):
+            # ValueError covers both invalid JSON and undecodable bytes.
+            return history
+        if not isinstance(data, dict):
+            return history
+        stored_max = data.get("max_entries", MAX_HISTORY)
+        # ``bool`` is an ``int`` subclass; reject it along with other junk so
+        # later length comparisons cannot fail.
+        if isinstance(stored_max, int) and not isinstance(stored_max, bool) and stored_max >= 0:
+            history._max_entries = stored_max
+        items = data.get("records", [])
+        if isinstance(items, list):
+            for item in items:
+                if not isinstance(item, dict):
+                    continue
+                try:
+                    history._records.append(QueryRecord.from_dict(item))
+                except TypeError:
+                    # The record lacks the required ``query`` field.
+                    continue
+        history._trim()
+        return history
+    # ------------------------------------------------------------------
+    # Internal
+    # ------------------------------------------------------------------
+    def _trim(self) -> None:
+        """Drop the oldest records so that at most ``_max_entries`` remain."""
+        overflow = len(self._records) - self._max_entries
+        if overflow > 0:
+            del self._records[:overflow]
+    @staticmethod
+    def _top_n(counts: dict[str, int], n: int) -> list[tuple[str, int]]:
+        """Return the *n* highest-count items; ties keep first-seen order."""
+        ranked = sorted(counts.items(), key=lambda x: x[1], reverse=True)
+        return ranked[: max(n, 0)]

semantic_code_intelligence/storage/symbol_registry.py ADDED Viewed

@@ -0,0 +1,209 @@
+"""Symbol registry — persistent, queryable directory of code symbols.
+Stores every function, class, and method extracted from the codebase,
+enabling fast lookups by name, kind, file, or parent class without
+re-parsing source files.
+"""
+from __future__ import annotations
+import json
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Any, Iterator
+# Name of the JSON file (inside the directory passed to save()/load()) that holds the registry.
+REGISTRY_FILE = "symbol_registry.json"
+@dataclass
+class SymbolEntry:
+    """Description of one code symbol and where it is located."""
+    name: str
+    kind: str  # "function", "class", "method", "import"
+    file_path: str
+    start_line: int
+    end_line: int
+    parent: str | None = None
+    parameters: list[str] = field(default_factory=list)
+    decorators: list[str] = field(default_factory=list)
+    language: str = ""
+    def to_dict(self) -> dict[str, Any]:
+        """Return the entry as a plain dictionary keyed by field name."""
+        as_mapping: dict[str, Any] = asdict(self)
+        return as_mapping
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> SymbolEntry:
+        """Build an entry from *data*, ignoring keys that are not fields."""
+        # The keys of ``__dataclass_fields__`` are exactly the field names.
+        field_names = set(cls.__dataclass_fields__)  # type: ignore[attr-defined]
+        kwargs: dict[str, Any] = {}
+        for key, value in data.items():
+            if key in field_names:
+                kwargs[key] = value
+        return cls(**kwargs)
+    @property
+    def qualified_name(self) -> str:
+        """Return ``parent.name`` when a parent is set, otherwise ``name``."""
+        return f"{self.parent}.{self.name}" if self.parent else self.name
+class SymbolRegistry:
+    """Persistent symbol directory backed by JSON.
+    Supports incremental updates (clear symbols for a file, then re-add),
+    multi-criteria lookups, and disk persistence.
+    """
+    def __init__(self) -> None:
+        """Create an empty registry."""
+        self._symbols: list[SymbolEntry] = []
+        # Secondary index: file_path → list of indices into _symbols
+        self._by_file: dict[str, list[int]] = {}
+    # ------------------------------------------------------------------
+    # Mutation
+    # ------------------------------------------------------------------
+    def add(self, entry: SymbolEntry) -> None:
+        """Add a symbol entry to the registry."""
+        idx = len(self._symbols)
+        self._symbols.append(entry)
+        self._by_file.setdefault(entry.file_path, []).append(idx)
+    def add_many(self, entries: list[SymbolEntry]) -> None:
+        """Bulk-add symbol entries."""
+        for entry in entries:
+            self.add(entry)
+    def remove_file(self, file_path: str) -> int:
+        """Remove all symbols belonging to *file_path*.
+        Returns the number of entries removed.
+        """
+        indices = self._by_file.pop(file_path, [])
+        if not indices:
+            return 0
+        drop = set(indices)
+        # One filtering pass keeps the remaining symbols in their original order.
+        self._symbols = [s for i, s in enumerate(self._symbols) if i not in drop]
+        # Positions shifted, so every per-file index list must be recomputed.
+        self._rebuild_file_index()
+        return len(indices)
+    def clear(self) -> None:
+        """Remove all symbols."""
+        self._symbols.clear()
+        self._by_file.clear()
+    # ------------------------------------------------------------------
+    # Queries
+    # ------------------------------------------------------------------
+    @property
+    def size(self) -> int:
+        """Number of symbols currently held."""
+        return len(self._symbols)
+    @property
+    def files(self) -> list[str]:
+        """Return all tracked file paths."""
+        return list(self._by_file)
+    def find_by_name(self, name: str) -> list[SymbolEntry]:
+        """Find all symbols with the exact *name*."""
+        return [s for s in self._symbols if s.name == name]
+    def find_by_kind(self, kind: str) -> list[SymbolEntry]:
+        """Find all symbols of a given *kind* (function, class, method, import)."""
+        return [s for s in self._symbols if s.kind == kind]
+    def find_by_file(self, file_path: str) -> list[SymbolEntry]:
+        """Return all symbols in the given file."""
+        return list(self._iter_candidates(file_path))
+    def find(
+        self,
+        name: str | None = None,
+        kind: str | None = None,
+        file_path: str | None = None,
+        parent: str | None = None,
+        language: str | None = None,
+    ) -> list[SymbolEntry]:
+        """Multi-criteria symbol lookup.  ``None`` fields are not filtered.
+        When *file_path* is given, only that file's symbols are scanned.
+        """
+        results: list[SymbolEntry] = []
+        for sym in self._iter_candidates(file_path):
+            if name is not None and sym.name != name:
+                continue
+            if kind is not None and sym.kind != kind:
+                continue
+            if parent is not None and sym.parent != parent:
+                continue
+            if language is not None and sym.language != language:
+                continue
+            results.append(sym)
+        return results
+    def search_name(self, substring: str) -> list[SymbolEntry]:
+        """Return symbols whose name contains *substring* (case-insensitive)."""
+        lower = substring.lower()
+        return [s for s in self._symbols if lower in s.name.lower()]
+    def language_summary(self) -> dict[str, int]:
+        """Return a count of symbols per language (empty language → "unknown")."""
+        counts: dict[str, int] = {}
+        for s in self._symbols:
+            lang = s.language or "unknown"
+            counts[lang] = counts.get(lang, 0) + 1
+        return counts
+    def kind_summary(self) -> dict[str, int]:
+        """Return a count of symbols per kind."""
+        counts: dict[str, int] = {}
+        for s in self._symbols:
+            counts[s.kind] = counts.get(s.kind, 0) + 1
+        return counts
+    # ------------------------------------------------------------------
+    # Persistence
+    # ------------------------------------------------------------------
+    def save(self, directory: str | Path) -> None:
+        """Write registry to ``REGISTRY_FILE`` inside *directory* as JSON."""
+        path = Path(directory)
+        path.mkdir(parents=True, exist_ok=True)
+        data = [s.to_dict() for s in self._symbols]
+        (path / REGISTRY_FILE).write_text(
+            json.dumps(data, ensure_ascii=False, indent=2),
+            encoding="utf-8",
+        )
+    @classmethod
+    def load(cls, directory: str | Path) -> SymbolRegistry:
+        """Load registry from disk.  Returns empty registry if absent.
+        Loading is best-effort: an unreadable or malformed file yields an
+        empty registry, and individual malformed entries are skipped.
+        """
+        registry = cls()
+        path = Path(directory) / REGISTRY_FILE
+        if not path.exists():
+            return registry
+        try:
+            raw = json.loads(path.read_text(encoding="utf-8"))
+        except (ValueError, OSError):
+            # ValueError covers both invalid JSON and undecodable bytes.
+            return registry
+        if not isinstance(raw, list):
+            return registry
+        for item in raw:
+            if not isinstance(item, dict):
+                continue
+            try:
+                registry.add(SymbolEntry.from_dict(item))
+            except TypeError:
+                # The entry lacks one of the required fields
+                # (name, kind, file_path, start_line, end_line).
+                continue
+        return registry
+    # ------------------------------------------------------------------
+    # Internal
+    # ------------------------------------------------------------------
+    def _rebuild_file_index(self) -> None:
+        """Recompute ``_by_file`` from the current symbol list."""
+        self._by_file.clear()
+        for i, sym in enumerate(self._symbols):
+            self._by_file.setdefault(sym.file_path, []).append(i)
+    def _iter_candidates(self, file_path: str | None) -> Iterator[SymbolEntry]:
+        """Yield the symbols of *file_path*, or every symbol when it is ``None``."""
+        if file_path is not None:
+            for i in self._by_file.get(file_path, []):
+                yield self._symbols[i]
+        else:
+            yield from self._symbols

semantic_code_intelligence/storage/vector_store.py ADDED Viewed

@@ -0,0 +1,297 @@
+"""Vector store — FAISS-based storage and retrieval of code embeddings.
+Supports two index modes:
+- **Flat** (default): Brute-force exact search — best for <50 k vectors.
+- **IVF**: Inverted-file approximate search — faster for large repos (>50 k).
+  Enabled automatically when the vector count crosses *IVF_THRESHOLD* or by
+  passing ``use_ivf=True`` to the constructor.
+"""
+from __future__ import annotations
+import json
+from collections import defaultdict
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any
+import faiss
+import numpy as np
+from semantic_code_intelligence.utils.logging import get_logger
+# Module-level logger used for the store's debug/info messages.
+logger = get_logger("storage")
+# If the store has more vectors than this, it can benefit from IVF.
+# VectorStore.add() upgrades a flat index once its size reaches this value.
+IVF_THRESHOLD = 50_000
+# Also the minimum number of vectors VectorStore requires before training an IVF index.
+IVF_NLIST = 100  # number of Voronoi cells
+IVF_NPROBE = 10  # cells probed at search time
+@dataclass
+class ChunkMetadata:
+    """Metadata associated with a stored code chunk."""
+    # Path of the file the chunk belongs to; VectorStore keys its per-file index on it.
+    file_path: str
+    # Line range of the chunk — presumably within ``file_path``; TODO confirm inclusivity and base.
+    start_line: int
+    end_line: int
+    # Presumably the chunk's ordinal position within its file — verify against the chunker.
+    chunk_index: int
+    language: str
+    # Chunk text — presumably the source code that was embedded; confirm with the indexer.
+    content: str
+    # Optional hash; defaults to the empty string when none is supplied.
+    content_hash: str = ""
+class VectorStore:
+    """FAISS-backed vector store for code chunk embeddings.
+    Maintains a FAISS index and parallel metadata list.
+    Supports save/load to disk for persistence.
+    When *use_ivf* is ``True`` (or the vector count exceeds *IVF_THRESHOLD*),
+    the store transparently migrates to a ``faiss.IndexIVFFlat`` for faster
+    approximate nearest-neighbour search.
+    """
+    def __init__(self, dimension: int, *, use_ivf: bool = False) -> None:
+        """Create an empty store for vectors of size *dimension*.
+        Args:
+            dimension: Length of every embedding vector.
+            use_ivf: Start with an (untrained) IVF index instead of a flat one.
+        """
+        self.dimension = dimension
+        self._use_ivf = use_ivf
+        if use_ivf:
+            self.index = self._new_ivf_index(dimension)
+            self._ivf_trained = False
+        else:
+            self.index = faiss.IndexFlatIP(dimension)
+            self._ivf_trained = True  # flat doesn't need training
+        self.metadata: list[ChunkMetadata] = []
+        # Reverse index: file_path -> set of vector indices for O(1) lookup
+        self._file_index: dict[str, set[int]] = defaultdict(set)
+    @property
+    def size(self) -> int:
+        """Return the number of vectors stored."""
+        return int(self.index.ntotal)
+    def add(
+        self,
+        embeddings: np.ndarray,
+        metadata_list: list[ChunkMetadata],
+    ) -> None:
+        """Add embeddings and their metadata to the store.
+        If the store uses an IVF index that hasn't been trained yet, the first
+        batch of vectors is used to train it.  If the store is in flat mode and
+        the total count crosses *IVF_THRESHOLD*, it auto-upgrades to IVF.
+        Raises:
+            ValueError: If the embedding and metadata counts differ, or the
+                embeddings are not of shape ``(n, dimension)``.
+        """
+        if len(embeddings) != len(metadata_list):
+            raise ValueError(
+                f"Embedding count ({len(embeddings)}) != metadata count ({len(metadata_list)})"
+            )
+        if len(embeddings) == 0:
+            return
+        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
+        if embeddings.ndim != 2 or embeddings.shape[1] != self.dimension:
+            raise ValueError(
+                f"Expected embeddings of shape (n, {self.dimension}), got {embeddings.shape}"
+            )
+        # Train IVF index on first batch if needed
+        if self._use_ivf and not self._ivf_trained:
+            if len(embeddings) >= IVF_NLIST:
+                self.index.train(embeddings)
+                self._ivf_trained = True
+            else:
+                # Not enough vectors to train — fall back to flat; the store
+                # upgrades again once IVF_THRESHOLD is reached.
+                logger.debug("Not enough vectors to train IVF (%d < %d), using flat.", len(embeddings), IVF_NLIST)
+                self.index = faiss.IndexFlatIP(self.dimension)
+                self._use_ivf = False
+                self._ivf_trained = True
+        # Add to FAISS first so that a failure there cannot leave the metadata
+        # and file index pointing at vectors that were never stored.
+        self.index.add(embeddings)
+        base = len(self.metadata)
+        self.metadata.extend(metadata_list)
+        for offset, meta in enumerate(metadata_list):
+            self._file_index[meta.file_path].add(base + offset)
+        # Auto-upgrade from flat to IVF when threshold is crossed
+        if not self._use_ivf and self.size >= IVF_THRESHOLD:
+            self._upgrade_to_ivf()
+    def search(
+        self,
+        query_embedding: np.ndarray,
+        top_k: int = 10,
+    ) -> list[tuple[ChunkMetadata, float]]:
+        """Search for the most similar embeddings.
+        Args:
+            query_embedding: Query vector of shape (dimension,) or (1, dimension).
+            top_k: Number of top results to return.
+        Returns:
+            List of (metadata, score) tuples, ordered by decreasing similarity.
+            Empty when the store is empty or *top_k* is not positive.
+        """
+        if self.size == 0 or top_k <= 0:
+            # FAISS requires k > 0, so answer trivially instead of calling it.
+            return []
+        query = np.ascontiguousarray(
+            query_embedding.reshape(1, -1), dtype=np.float32
+        )
+        k = min(top_k, self.size)
+        scores, indices = self.index.search(query, k)
+        results: list[tuple[ChunkMetadata, float]] = []
+        for score, idx in zip(scores[0], indices[0]):
+            if idx < 0:
+                # FAISS pads with -1 when fewer than k neighbours are found.
+                continue
+            results.append((self.metadata[int(idx)], float(score)))
+        return results
+    def save(self, directory: Path) -> None:
+        """Persist the vector store to disk.
+        Saves the FAISS index and metadata as separate files.
+        Args:
+            directory: Directory to save into.
+        """
+        directory = Path(directory)
+        directory.mkdir(parents=True, exist_ok=True)
+        index_path = directory / "vectors.faiss"
+        meta_path = directory / "metadata.json"
+        faiss.write_index(self.index, str(index_path))
+        meta_dicts = [asdict(m) for m in self.metadata]
+        meta_path.write_text(
+            json.dumps(meta_dicts, ensure_ascii=False),
+            encoding="utf-8",
+        )
+        logger.info("Saved %d vectors to %s", self.size, directory)
+    @classmethod
+    def load(cls, directory: Path) -> "VectorStore":
+        """Load a vector store from disk.
+        Args:
+            directory: Directory containing vectors.faiss and metadata.json.
+        Returns:
+            A populated VectorStore instance.
+        Raises:
+            FileNotFoundError: If the required files don't exist.
+        """
+        directory = Path(directory)
+        index_path = directory / "vectors.faiss"
+        meta_path = directory / "metadata.json"
+        if not index_path.exists() or not meta_path.exists():
+            raise FileNotFoundError(f"No vector store found in {directory}")
+        index = faiss.read_index(str(index_path))
+        meta_dicts = json.loads(meta_path.read_text(encoding="utf-8"))
+        # Ignore unknown keys so files with extra fields still load.
+        known = set(ChunkMetadata.__dataclass_fields__)  # type: ignore[attr-defined]
+        metadata = [
+            ChunkMetadata(**{k: v for k, v in m.items() if k in known})
+            for m in meta_dicts
+        ]
+        store = cls(index.d)
+        store.index = index
+        store.metadata = metadata
+        # Mirror the persisted index type; otherwise an IVF index past the
+        # threshold would be treated as flat and re-upgraded on every add().
+        store._use_ivf = hasattr(index, "nprobe")
+        store._ivf_trained = bool(index.is_trained)
+        # Rebuild file index from loaded metadata
+        for i, m in enumerate(metadata):
+            store._file_index[m.file_path].add(i)
+        if len(metadata) != store.size:
+            logger.warning(
+                "Metadata count (%d) does not match vector count (%d) in %s",
+                len(metadata), store.size, directory,
+            )
+        logger.info("Loaded %d vectors from %s", store.size, directory)
+        return store
+    def remove_by_file(self, file_path: str) -> int:
+        """Remove all vectors whose metadata references *file_path*.
+        Looks the file up in the file index, then rebuilds the FAISS index
+        from the vectors that remain.
+        Args:
+            file_path: The ``file_path`` field to match against.
+        Returns:
+            Number of vectors removed.
+        """
+        remove_set = self._file_index.get(file_path)
+        if not remove_set:
+            return 0
+        removed = len(remove_set)
+        keep_indices = [
+            i for i in range(len(self.metadata)) if i not in remove_set
+        ]
+        if keep_indices:
+            # Reconstruct every stored vector in one call, then select the
+            # survivors with a single fancy-index operation.
+            all_vectors = self.index.reconstruct_n(0, self.size)
+            kept_vectors = np.ascontiguousarray(
+                all_vectors[keep_indices], dtype=np.float32,
+            )
+        else:
+            kept_vectors = np.empty((0, self.dimension), dtype=np.float32)
+        kept_meta = [self.metadata[i] for i in keep_indices]
+        self.index.reset()
+        if len(kept_vectors) > 0:
+            self.index.add(kept_vectors)
+        self.metadata = kept_meta
+        # Rebuild file index
+        self._file_index.clear()
+        for i, m in enumerate(self.metadata):
+            self._file_index[m.file_path].add(i)
+        logger.debug("Removed %d vectors for %s", removed, file_path)
+        return removed
+    def get_vectors_for_file(self, file_path: str) -> list[tuple[ChunkMetadata, np.ndarray]]:
+        """Return metadata and vectors for all chunks belonging to a file.
+        Used by incremental indexing to preserve vectors for unchanged chunks
+        before removing the file's entries from the store.
+        Returns:
+            List of (metadata, vector) pairs, in ascending vector-index order.
+        """
+        indices = self._file_index.get(file_path)
+        if not indices:
+            return []
+        # Per-id reconstruct() on an IVF index needs a direct map.
+        self._ensure_direct_map()
+        result: list[tuple[ChunkMetadata, np.ndarray]] = []
+        for idx in sorted(indices):
+            vec = np.asarray(self.index.reconstruct(idx), dtype=np.float32)
+            result.append((self.metadata[idx], vec))
+        return result
+    def clear(self) -> None:
+        """Remove all vectors and metadata."""
+        self.index.reset()
+        self.metadata.clear()
+        self._file_index.clear()
+    # ------------------------------------------------------------------
+    # IVF helpers
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _new_ivf_index(dimension: int) -> Any:
+        """Build an empty, untrained inner-product IVF index."""
+        quantizer = faiss.IndexFlatIP(dimension)
+        ivf = faiss.IndexIVFFlat(quantizer, dimension, IVF_NLIST, faiss.METRIC_INNER_PRODUCT)
+        ivf.nprobe = IVF_NPROBE
+        return ivf
+    def _ensure_direct_map(self) -> None:
+        """Enable per-id ``reconstruct()`` on IVF indexes (no-op for flat)."""
+        # Flat indexes have no ``make_direct_map``; IVF indexes do.
+        # NOTE(review): relies on make_direct_map() being a no-op when the
+        # map already exists — confirm against the installed FAISS version.
+        make_map = getattr(self.index, "make_direct_map", None)
+        if make_map is not None:
+            make_map()
+    def _upgrade_to_ivf(self) -> None:
+        """Migrate an in-memory flat index to IVF for faster search."""
+        n = self.size
+        if n < IVF_NLIST:
+            return  # not enough vectors
+        logger.info("Auto-upgrading index to IVF (%d vectors).", n)
+        all_vecs = np.ascontiguousarray(
+            self.index.reconstruct_n(0, n), dtype=np.float32,
+        )
+        ivf = self._new_ivf_index(self.dimension)
+        ivf.train(all_vecs)
+        ivf.add(all_vecs)
+        self.index = ivf
+        self._use_ivf = True
+        self._ivf_trained = True

semantic_code_intelligence/tests/__init__.py ADDED Viewed

File without changes