PyPI - codegraph-cli - Versions diffs - 2.1.0__py3-none-any.whl → 2.1.2__py3-none-any.whl - Mend

codegraph-cli 2.1.0__py3-none-any.whl → 2.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

codegraph_cli/__init__.py +1 -1
codegraph_cli/agents.py +59 -3
codegraph_cli/chat_agent.py +58 -11
codegraph_cli/cli.py +569 -54
codegraph_cli/cli_chat.py +204 -94
codegraph_cli/cli_diagnose.py +13 -2
codegraph_cli/cli_docs.py +207 -0
codegraph_cli/cli_explore.py +1053 -0
codegraph_cli/cli_export.py +941 -0
codegraph_cli/cli_groups.py +33 -0
codegraph_cli/cli_health.py +316 -0
codegraph_cli/cli_history.py +213 -0
codegraph_cli/cli_onboard.py +380 -0
codegraph_cli/cli_quickstart.py +256 -0
codegraph_cli/cli_refactor.py +17 -3
codegraph_cli/cli_setup.py +12 -12
codegraph_cli/cli_suggestions.py +90 -0
codegraph_cli/cli_test.py +17 -3
codegraph_cli/cli_tui.py +210 -0
codegraph_cli/cli_v2.py +24 -4
codegraph_cli/cli_watch.py +158 -0
codegraph_cli/cli_workflows.py +255 -0
codegraph_cli/codegen_agent.py +15 -1
codegraph_cli/config.py +18 -5
codegraph_cli/context_manager.py +117 -15
codegraph_cli/crew_agents.py +32 -8
codegraph_cli/crew_chat.py +146 -13
codegraph_cli/crew_tools.py +30 -2
codegraph_cli/embeddings.py +95 -5
codegraph_cli/llm.py +42 -55
codegraph_cli/project_context.py +64 -1
codegraph_cli/rag.py +282 -19
codegraph_cli/storage.py +310 -14
codegraph_cli/vector_store.py +110 -8
{codegraph_cli-2.1.0.dist-info → codegraph_cli-2.1.2.dist-info}/METADATA +75 -21
codegraph_cli-2.1.2.dist-info/RECORD +55 -0
codegraph_cli-2.1.2.dist-info/entry_points.txt +2 -0
codegraph_cli-2.1.0.dist-info/RECORD +0 -43
codegraph_cli-2.1.0.dist-info/entry_points.txt +0 -2
{codegraph_cli-2.1.0.dist-info → codegraph_cli-2.1.2.dist-info}/WHEEL +0 -0
{codegraph_cli-2.1.0.dist-info → codegraph_cli-2.1.2.dist-info}/licenses/LICENSE +0 -0
{codegraph_cli-2.1.0.dist-info → codegraph_cli-2.1.2.dist-info}/top_level.txt +0 -0

codegraph_cli/storage.py CHANGED Viewed

@@ -13,6 +13,7 @@ from __future__ import annotations
 import json
 import logging
+import re
 import sqlite3
 from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional, Tuple
@@ -109,7 +110,7 @@ class GraphStore:
         self.conn.row_factory = sqlite3.Row
         self._init_schema()
-        # Initialise LanceDB vector store
+        # Initialise LanceDB vector store (default / legacy table)
         self.vector_store: Optional[VectorStore] = None
         if VECTOR_STORE_AVAILABLE:
             try:
@@ -117,6 +118,9 @@ class GraphStore:
             except Exception as exc:
                 logger.warning("LanceDB vector store unavailable: %s", exc)
+        # Per-model vector store cache: model_key → VectorStore
+        self._model_vector_stores: Dict[str, "VectorStore"] = {}
     def close(self) -> None:
         self.conn.close()
@@ -186,12 +190,19 @@ class GraphStore:
     # Insert
     # ------------------------------------------------------------------
-    def insert_nodes(self, rows: Iterable[Tuple[Node, List[float]]]) -> None:
+    def insert_nodes(
+        self,
+        rows: Iterable[Tuple[Node, List[float]]],
+        model_key: Optional[str] = None,
+    ) -> None:
         """Insert nodes with their embedding vectors.
         Each element of *rows* is a ``(Node, embedding)`` tuple.  Data is
         written to both SQLite (for structured queries) and LanceDB (for
         vector search).
+        When *model_key* is provided the embeddings are also written to
+        the model-specific LanceDB table (``code_nodes_{model_key}``).
         """
         rows_list = list(rows)
         if not rows_list:
@@ -225,26 +236,42 @@ class GraphStore:
         self.conn.commit()
         # ---- LanceDB (vector store) ------------------------------------
+        node_ids = [node.node_id for node, _ in rows_list]
+        embeddings = [emb for _, emb in rows_list]
+        metadatas = [
+            {
+                "node_type": node.node_type,
+                "file_path": node.file_path,
+                "qualname": node.qualname,
+                "name": node.name,
+            }
+            for node, _ in rows_list
+        ]
+        documents = [node.code for node, _ in rows_list]
+        # Write to legacy table (backward compat)
         if self.vector_store is not None:
             try:
-                node_ids = [node.node_id for node, _ in rows_list]
-                embeddings = [emb for _, emb in rows_list]
-                metadatas = [
-                    {
-                        "node_type": node.node_type,
-                        "file_path": node.file_path,
-                        "qualname": node.qualname,
-                        "name": node.name,
-                    }
-                    for node, _ in rows_list
-                ]
-                documents = [node.code for node, _ in rows_list]
                 self.vector_store.add_nodes(
                     node_ids, embeddings, metadatas, documents,
                 )
             except Exception as exc:
                 logger.warning("Failed to sync nodes to LanceDB: %s", exc)
+        # Write to model-specific table
+        if model_key:
+            model_vs = self.get_vector_store_for_model(model_key)
+            if model_vs is not None:
+                try:
+                    model_vs.add_nodes(
+                        node_ids, embeddings, metadatas, documents,
+                    )
+                except Exception as exc:
+                    logger.warning(
+                        "Failed to sync nodes to model table '%s': %s",
+                        model_key, exc,
+                    )
     def insert_edges(self, edges: Iterable[Edge]) -> None:
         cur = self.conn.cursor()
         cur.executemany(
@@ -253,6 +280,132 @@ class GraphStore:
         )
         self.conn.commit()
+    # ------------------------------------------------------------------
+    # Incremental index (single-file add / remove)
+    # ------------------------------------------------------------------
+    def remove_nodes_for_file(self, rel_path: str) -> int:
+        """Remove all nodes and related edges for a specific file.
+        Clears data from SQLite **and** every known LanceDB table
+        (legacy + per-model).  The LanceDB cleanup is best-effort: a
+        failure there is logged at debug level and never raised, so the
+        SQLite deletion always stands.
+        Args:
+            rel_path: Relative file path as stored in the ``file_path``
+                      column (e.g. ``"src/utils.py"``).
+        Returns:
+            Number of SQLite node rows deleted (``0`` when the file has
+            no nodes, in which case nothing else is touched).
+        """
+        cur = self.conn.cursor()
+        # 1. Count the nodes that belong to this file; bail out early if none
+        node_count = cur.execute(
+            "SELECT COUNT(*) FROM nodes WHERE file_path = ?", (rel_path,),
+        ).fetchone()[0]
+        if not node_count:
+            return 0
+        # 2. Delete edges referencing these nodes (src OR dst).  A subquery
+        #    is used instead of binding one parameter per node ID, so a file
+        #    with many nodes cannot exceed SQLite's host-parameter limit.
+        cur.execute(
+            "DELETE FROM edges"
+            " WHERE src IN (SELECT node_id FROM nodes WHERE file_path = ?)"
+            " OR dst IN (SELECT node_id FROM nodes WHERE file_path = ?)",
+            (rel_path, rel_path),
+        )
+        # 3. Delete nodes themselves (must come after the edge delete, which
+        #    still needs the node rows to resolve the subquery)
+        cur.execute("DELETE FROM nodes WHERE file_path = ?", (rel_path,))
+        self.conn.commit()
+        # 4. Remove from legacy LanceDB table
+        if self.vector_store is not None:
+            try:
+                self.vector_store.delete_by_file_path(rel_path)
+            except Exception as exc:
+                logger.debug("Legacy vector delete for '%s': %s", rel_path, exc)
+        # 5. Remove from all per-model LanceDB tables that are already open
+        for key, vs in self._model_vector_stores.items():
+            try:
+                vs.delete_by_file_path(rel_path)
+            except Exception as exc:
+                logger.debug(
+                    "Vector delete for '%s' in model table '%s': %s",
+                    rel_path, key, exc,
+                )
+        # 6. Also try tables that haven't been opened yet
+        if VECTOR_STORE_AVAILABLE:
+            try:
+                probe = VectorStore(self.project_dir, model_key="")
+                unopened = [
+                    mk for mk in probe.list_model_tables()
+                    if mk and mk not in self._model_vector_stores
+                ]
+            except Exception as exc:
+                logger.debug("Could not list model tables: %s", exc)
+                unopened = []
+            for mk in unopened:
+                try:
+                    vs = VectorStore(self.project_dir, model_key=mk)
+                    vs.delete_by_file_path(rel_path)
+                except Exception as exc:
+                    logger.debug(
+                        "Vector delete for '%s' in model table '%s': %s",
+                        rel_path, mk, exc,
+                    )
+        return node_count
+    def index_single_file(
+        self,
+        file_path: Path,
+        project_root: Path,
+        embedder: Any,
+        model_key: str = "",
+    ) -> int:
+        """Re-index one source file in place.
+        Whatever was previously stored for the file is dropped, the file
+        is parsed again, every resulting node is embedded, and the fresh
+        nodes and edges are written back.
+        Args:
+            file_path:    Absolute path to the source file.
+            project_root: Project root (for computing relative paths).
+            embedder:     Object with ``embed_text(str) -> List[float]``.
+            model_key:    Embedding model identifier.
+        Returns:
+            Number of nodes indexed for this file; ``0`` when parsing
+            fails or produces no nodes.
+        """
+        from .parser import PythonGraphParser
+        from .agents import _build_chunk_text
+        relative = str(file_path.relative_to(project_root))
+        # Stale rows go first, so a file that no longer parses ends up
+        # with no indexed data at all.
+        self.remove_nodes_for_file(relative)
+        graph_parser = PythonGraphParser(project_root)
+        try:
+            parsed_nodes, parsed_edges = graph_parser.parse_file(file_path)
+        except Exception as exc:
+            logger.warning("Failed to parse %s: %s", file_path, exc)
+            return 0
+        if not parsed_nodes:
+            return 0
+        # Pair each node with the embedding of its chunk text
+        embedded = [
+            (item, embedder.embed_text(_build_chunk_text(item)))
+            for item in parsed_nodes
+        ]
+        self.insert_nodes(embedded, model_key=model_key)
+        self.insert_edges(parsed_edges)
+        node_total = len(parsed_nodes)
+        logger.info(
+            "Incremental index: %d nodes, %d edges for %s",
+            node_total, len(parsed_edges), relative,
+        )
+        return node_total
     # ------------------------------------------------------------------
     # Read (structured)
     # ------------------------------------------------------------------
@@ -388,3 +541,146 @@ class GraphStore:
             edge_rows,
         )
         self.conn.commit()
+    # ------------------------------------------------------------------
+    # Per-model vector stores  (auto re-ingestion)
+    # ------------------------------------------------------------------
+    def get_vector_store_for_model(self, model_key: str) -> Optional["VectorStore"]:
+        """Return the LanceDB vector store bound to one embedding model.
+        Stores are created on first request and cached per *model_key*,
+        so repeated calls hand back the same object.  Each model uses
+        its own table, ``code_nodes_{model_key}``, which keeps vectors
+        of different dimensionality apart.
+        Returns ``None`` when LanceDB is not available or the store
+        cannot be created (the latter is logged as a warning).
+        """
+        if not VECTOR_STORE_AVAILABLE:
+            return None
+        cached = self._model_vector_stores.get(model_key)
+        if cached is not None:
+            return cached
+        try:
+            store = VectorStore(self.project_dir, model_key=model_key)
+        except Exception as exc:
+            logger.warning(
+                "Cannot create vector store for model '%s': %s", model_key, exc,
+            )
+            return None
+        self._model_vector_stores[model_key] = store
+        return store
+    def reingest_for_model(
+        self,
+        model_key: str,
+        embedder: Any,
+        chunk_builder: Any = None,
+    ) -> int:
+        """Re-embed all SQLite nodes into a model-specific LanceDB table.
+        Reads raw code/metadata from the SQLite ``nodes`` table,
+        computes embeddings with *embedder*, and writes them into the
+        LanceDB table for *model_key*.  Embeddings are computed *before*
+        the existing table is dropped, so an embedder failure leaves the
+        previously indexed vectors untouched.
+        Args:
+            model_key:     Embedding model identifier (e.g. ``"minilm"``).
+            embedder:      Object with an ``embed_text(str) -> List[float]``
+                           method (and optionally ``embed_documents``).
+            chunk_builder: Optional callable ``(dict) -> str`` that builds
+                           the text chunk from a node row dict.  Falls back
+                           to an internal default.
+        Returns:
+            Number of nodes ingested; ``0`` when LanceDB is unavailable,
+            there are no nodes, or the write fails.
+        """
+        vs = self.get_vector_store_for_model(model_key)
+        if vs is None:
+            return 0
+        rows = self.get_nodes()
+        if not rows:
+            return 0
+        if chunk_builder is None:
+            chunk_builder = _default_chunk_builder
+        node_ids: List[str] = []
+        metadatas: List[Dict[str, str]] = []
+        documents: List[str] = []
+        texts: List[str] = []
+        for row in rows:
+            row_dict = dict(row)
+            texts.append(chunk_builder(row_dict))
+            node_ids.append(row_dict["node_id"])
+            metadatas.append({
+                "node_type": row_dict["node_type"],
+                "file_path": row_dict["file_path"],
+                "qualname": row_dict["qualname"],
+                "name": row_dict["name"],
+            })
+            documents.append(row_dict["code"])
+        # Batch-embed when possible, single-embed otherwise.  Done before
+        # the table is cleared: if the embedder raises, the old data survives.
+        embeddings: List[List[float]]
+        if hasattr(embedder, "embed_documents"):
+            embeddings = embedder.embed_documents(texts)
+        else:
+            embeddings = [embedder.embed_text(t) for t in texts]
+        # Clear old data for this model's table and re-open
+        vs.clear()
+        self._model_vector_stores.pop(model_key, None)
+        vs = self.get_vector_store_for_model(model_key)
+        if vs is None:
+            return 0
+        try:
+            vs.add_nodes(node_ids, embeddings, metadatas, documents)
+            logger.info(
+                "Re-ingested %d nodes into table for embedding model '%s'.",
+                len(node_ids), model_key,
+            )
+        except Exception as exc:
+            logger.warning("Re-ingestion for model '%s' failed: %s", model_key, exc)
+            return 0
+        return len(node_ids)
+# ===================================================================
+# Helpers
+# ===================================================================
+# Regex to strip bare import lines from chunk text (mirrors agents._IMPORT_RE)
+_CHUNK_IMPORT_RE = re.compile(r"^(?:from\s+\S+\s+)?import\s+.+$", re.MULTILINE)
+# Upper bound on the number of code characters kept in one chunk
+_MAX_CHUNK_CODE = 1500
+def _default_chunk_builder(row: Dict[str, Any]) -> str:
+    """Build embedding text from a SQLite node row dict.
+    Mirrors :func:`codegraph_cli.agents._build_chunk_text` but works
+    with plain dicts instead of :class:`Node` objects.
+    The result is a newline-joined block of ``file:``, ``symbol:`` and
+    ``type:`` header lines, an optional ``doc:`` line, and the node's
+    code.  Non-module code has its import lines stripped and gets a
+    truncation marker when cut; module code is cut silently.
+    Args:
+        row: Node row with ``file_path``, ``qualname`` and ``node_type``
+             keys; ``docstring`` and ``code`` may be missing or ``None``.
+    Returns:
+        The text to embed for this node.
+    """
+    parts: List[str] = [
+        f"file: {row['file_path']}",
+        f"symbol: {row['qualname']}",
+        f"type: {row['node_type']}",
+    ]
+    docstring = (row.get("docstring") or "").strip()
+    if docstring:
+        parts.append(f"doc: {docstring}")
+    # ``or ""`` rather than a ``.get`` default: a NULL column yields a
+    # present key whose value is None, which would crash below.
+    code: str = row.get("code") or ""
+    if row["node_type"] != "module":
+        code = _CHUNK_IMPORT_RE.sub("", code).strip()
+    else:
+        code = code[:_MAX_CHUNK_CODE]
+    if len(code) > _MAX_CHUNK_CODE:
+        code = code[:_MAX_CHUNK_CODE] + "\n# ... (truncated)"
+    if code:
+        parts.append(code)
+    return "\n".join(parts)

codegraph_cli/vector_store.py CHANGED Viewed

@@ -49,22 +49,26 @@ class VectorStore:
     ======== ============ =====================================
     """
-    def __init__(self, project_dir: Path) -> None:
+    def __init__(self, project_dir: Path, model_key: str = "") -> None:
         if not LANCE_AVAILABLE:
             raise ImportError(
                 "lancedb is not installed. Install with: pip install lancedb pyarrow"
             )
         self.project_dir = project_dir
+        self.model_key = model_key
         self._lance_dir = project_dir / "lancedb"
         self._lance_dir.mkdir(exist_ok=True, parents=True)
+        # Each embedding model gets its own table to avoid dimension conflicts
+        self._table_name = f"code_nodes_{model_key}" if model_key else "code_nodes"
         self._db: Any = lancedb.connect(str(self._lance_dir))
         self._table: Optional[Any] = None
         # Try to open existing table
         try:
-            self._table = self._db.open_table("code_nodes")
+            self._table = self._db.open_table(self._table_name)
         except Exception:
             self._table = None
@@ -106,7 +110,7 @@ class VectorStore:
         if self._table is None:
             # First insert – create the table (schema inferred from data)
             self._table = self._db.create_table(
-                "code_nodes", data=rows, mode="overwrite",
+                self._table_name, data=rows, mode="overwrite",
             )
         else:
             # Subsequent inserts – upsert by deleting old IDs first
@@ -150,7 +154,12 @@ class VectorStore:
             return empty
         try:
-            query = self._table.search(query_embedding).limit(n_results)
+            query = (
+                self._table
+                .search(query_embedding)
+                .metric("cosine")
+                .limit(n_results)
+            )
             # Apply metadata filters as SQL WHERE clause
             if where:
@@ -171,8 +180,11 @@ class VectorStore:
         docs: List[str] = []
         for row in results:
+            # With cosine metric, _distance is the *cosine distance*
+            # (1 − cos_sim), so values are in [0, 2].
+            dist = row.get("_distance", 0.0)
             ids.append(row.get("id", ""))
-            distances.append(row.get("_distance", 0.0))
+            distances.append(dist)
             metas.append({
                 "node_type": row.get("node_type", ""),
                 "file_path": row.get("file_path", ""),
@@ -209,7 +221,12 @@ class VectorStore:
             return []
         try:
-            query = self._table.search(query_embedding).limit(n_results)
+            query = (
+                self._table
+                .search(query_embedding)
+                .metric("cosine")
+                .limit(n_results)
+            )
             if where_sql:
                 query = query.where(where_sql)
             return query.to_list()
@@ -226,7 +243,7 @@ class VectorStore:
         if self._table is None:
             return None
         try:
-            import pandas as pd  # type: ignore[import-untyped]
+            import pandas as pd  # type: ignore[import-untyped]  # noqa: F811
             df: pd.DataFrame = self._table.to_pandas()
             match = df[df["id"] == node_id]
             if match.empty:
@@ -260,14 +277,58 @@ class VectorStore:
             except Exception:
                 pass
+    def delete_by_file_path(self, file_path: str) -> int:
+        """Drop every row whose ``file_path`` column equals *file_path*.
+        Args:
+            file_path: Relative file path (must match the ``file_path``
+                       column stored during indexing).
+        Returns:
+            How many rows disappeared, measured as the row count before
+            minus the row count after.  ``0`` when there is no table or
+            the delete fails (failures are logged as warnings).
+        """
+        table = self._table
+        if table is None:
+            return 0
+        try:
+            rows_before = table.count_rows()
+            # Double any single quote so the path stays one SQL string literal
+            quoted = file_path.replace("'", "''")
+            table.delete(f"file_path = '{quoted}'")
+            removed = rows_before - table.count_rows()
+        except Exception as exc:
+            logger.warning(
+                "delete_by_file_path('%s') failed: %s", file_path, exc,
+            )
+            return 0
+        return removed if removed > 0 else 0
     def clear(self) -> None:
         """Drop all data and recreate an empty table."""
         try:
-            self._db.drop_table("code_nodes")
+            self._db.drop_table(self._table_name)
         except Exception:
             pass
         self._table = None
+    def list_model_tables(self) -> List[str]:
+        """List the model keys that currently have a LanceDB table.
+        A table called ``code_nodes_{model_key}`` contributes
+        ``model_key``; the legacy ``code_nodes`` table contributes the
+        empty string.  Tables with any other name are ignored.  An empty
+        list is returned when the table names cannot be read.
+        """
+        try:
+            table_names = self._db.table_names()
+        except Exception:
+            return []
+        legacy = "code_nodes"
+        prefix = "code_nodes_"
+        return [
+            "" if table_name == legacy else table_name[len(prefix):]
+            for table_name in table_names
+            if table_name == legacy or table_name.startswith(prefix)
+        ]
     # ------------------------------------------------------------------
     # Informational
     # ------------------------------------------------------------------
@@ -291,3 +352,44 @@ class VectorStore:
             return df.head(limit).to_dict(orient="records")
         except Exception:
             return []
+    def debug_search(
+        self,
+        query_embedding: List[float],
+        n_results: int = 5,
+    ) -> List[Dict[str, Any]]:
+        """Run a cosine search and report per-hit scoring details.
+        Intended for inspecting retrieval quality: each hit becomes one
+        flat dict carrying the rounded ``_distance`` value, a similarity
+        score derived from it (``1 - distance``, floored at 0), the key
+        metadata fields, and the first 120 characters of the document.
+        Returns an empty list when there is no table or the search
+        fails (failures are logged as warnings).
+        """
+        table = self._table
+        if table is None:
+            return []
+        try:
+            query = table.search(query_embedding).metric("cosine").limit(n_results)
+            hits = query.to_list()
+        except Exception as exc:
+            logger.warning("debug_search failed: %s", exc)
+            return []
+        def _describe(hit: Dict[str, Any]) -> Dict[str, Any]:
+            # Flatten one raw search hit into the diagnostic record
+            distance = hit.get("_distance", 0.0)
+            preview_source = hit.get("document", "") or ""
+            return {
+                "id": hit.get("id", ""),
+                "name": hit.get("name", ""),
+                "qualname": hit.get("qualname", ""),
+                "node_type": hit.get("node_type", ""),
+                "file_path": hit.get("file_path", ""),
+                "cosine_distance": round(distance, 5),
+                "similarity_score": round(max(0.0, 1.0 - distance), 5),
+                "document_preview": preview_source[:120],
+            }
+        return [_describe(hit) for hit in hits]

codegraph-cli 2.1.0__py3-none-any.whl → 2.1.2__py3-none-any.whl

codegraph-cli 2.1.0__py3-none-any.whl → 2.1.2__py3-none-any.whl