codegraph-cli 2.1.0__py3-none-any.whl → 2.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. codegraph_cli/__init__.py +1 -1
  2. codegraph_cli/agents.py +59 -3
  3. codegraph_cli/chat_agent.py +58 -11
  4. codegraph_cli/cli.py +569 -54
  5. codegraph_cli/cli_chat.py +204 -94
  6. codegraph_cli/cli_diagnose.py +13 -2
  7. codegraph_cli/cli_docs.py +207 -0
  8. codegraph_cli/cli_explore.py +1053 -0
  9. codegraph_cli/cli_export.py +941 -0
  10. codegraph_cli/cli_groups.py +33 -0
  11. codegraph_cli/cli_health.py +316 -0
  12. codegraph_cli/cli_history.py +213 -0
  13. codegraph_cli/cli_onboard.py +380 -0
  14. codegraph_cli/cli_quickstart.py +256 -0
  15. codegraph_cli/cli_refactor.py +17 -3
  16. codegraph_cli/cli_setup.py +12 -12
  17. codegraph_cli/cli_suggestions.py +90 -0
  18. codegraph_cli/cli_test.py +17 -3
  19. codegraph_cli/cli_tui.py +210 -0
  20. codegraph_cli/cli_v2.py +24 -4
  21. codegraph_cli/cli_watch.py +158 -0
  22. codegraph_cli/cli_workflows.py +255 -0
  23. codegraph_cli/codegen_agent.py +15 -1
  24. codegraph_cli/config.py +18 -5
  25. codegraph_cli/context_manager.py +117 -15
  26. codegraph_cli/crew_agents.py +32 -8
  27. codegraph_cli/crew_chat.py +146 -13
  28. codegraph_cli/crew_tools.py +30 -2
  29. codegraph_cli/embeddings.py +95 -5
  30. codegraph_cli/llm.py +42 -55
  31. codegraph_cli/project_context.py +64 -1
  32. codegraph_cli/rag.py +282 -19
  33. codegraph_cli/storage.py +310 -14
  34. codegraph_cli/vector_store.py +110 -8
  35. {codegraph_cli-2.1.0.dist-info → codegraph_cli-2.1.2.dist-info}/METADATA +75 -21
  36. codegraph_cli-2.1.2.dist-info/RECORD +55 -0
  37. codegraph_cli-2.1.2.dist-info/entry_points.txt +2 -0
  38. codegraph_cli-2.1.0.dist-info/RECORD +0 -43
  39. codegraph_cli-2.1.0.dist-info/entry_points.txt +0 -2
  40. {codegraph_cli-2.1.0.dist-info → codegraph_cli-2.1.2.dist-info}/WHEEL +0 -0
  41. {codegraph_cli-2.1.0.dist-info → codegraph_cli-2.1.2.dist-info}/licenses/LICENSE +0 -0
  42. {codegraph_cli-2.1.0.dist-info → codegraph_cli-2.1.2.dist-info}/top_level.txt +0 -0
codegraph_cli/storage.py CHANGED
@@ -13,6 +13,7 @@ from __future__ import annotations
13
13
 
14
14
  import json
15
15
  import logging
16
+ import re
16
17
  import sqlite3
17
18
  from pathlib import Path
18
19
  from typing import Any, Dict, Iterable, List, Optional, Tuple
@@ -109,7 +110,7 @@ class GraphStore:
109
110
  self.conn.row_factory = sqlite3.Row
110
111
  self._init_schema()
111
112
 
112
- # Initialise LanceDB vector store
113
+ # Initialise LanceDB vector store (default / legacy table)
113
114
  self.vector_store: Optional[VectorStore] = None
114
115
  if VECTOR_STORE_AVAILABLE:
115
116
  try:
@@ -117,6 +118,9 @@ class GraphStore:
117
118
  except Exception as exc:
118
119
  logger.warning("LanceDB vector store unavailable: %s", exc)
119
120
 
121
+ # Per-model vector store cache: model_key → VectorStore
122
+ self._model_vector_stores: Dict[str, "VectorStore"] = {}
123
+
120
124
  def close(self) -> None:
121
125
  self.conn.close()
122
126
 
@@ -186,12 +190,19 @@ class GraphStore:
186
190
  # Insert
187
191
  # ------------------------------------------------------------------
188
192
 
189
- def insert_nodes(self, rows: Iterable[Tuple[Node, List[float]]]) -> None:
193
+ def insert_nodes(
194
+ self,
195
+ rows: Iterable[Tuple[Node, List[float]]],
196
+ model_key: Optional[str] = None,
197
+ ) -> None:
190
198
  """Insert nodes with their embedding vectors.
191
199
 
192
200
  Each element of *rows* is a ``(Node, embedding)`` tuple. Data is
193
201
  written to both SQLite (for structured queries) and LanceDB (for
194
202
  vector search).
203
+
204
+ When *model_key* is provided the embeddings are also written to
205
+ the model-specific LanceDB table (``code_nodes_{model_key}``).
195
206
  """
196
207
  rows_list = list(rows)
197
208
  if not rows_list:
@@ -225,26 +236,42 @@ class GraphStore:
225
236
  self.conn.commit()
226
237
 
227
238
  # ---- LanceDB (vector store) ------------------------------------
239
+ node_ids = [node.node_id for node, _ in rows_list]
240
+ embeddings = [emb for _, emb in rows_list]
241
+ metadatas = [
242
+ {
243
+ "node_type": node.node_type,
244
+ "file_path": node.file_path,
245
+ "qualname": node.qualname,
246
+ "name": node.name,
247
+ }
248
+ for node, _ in rows_list
249
+ ]
250
+ documents = [node.code for node, _ in rows_list]
251
+
252
+ # Write to legacy table (backward compat)
228
253
  if self.vector_store is not None:
229
254
  try:
230
- node_ids = [node.node_id for node, _ in rows_list]
231
- embeddings = [emb for _, emb in rows_list]
232
- metadatas = [
233
- {
234
- "node_type": node.node_type,
235
- "file_path": node.file_path,
236
- "qualname": node.qualname,
237
- "name": node.name,
238
- }
239
- for node, _ in rows_list
240
- ]
241
- documents = [node.code for node, _ in rows_list]
242
255
  self.vector_store.add_nodes(
243
256
  node_ids, embeddings, metadatas, documents,
244
257
  )
245
258
  except Exception as exc:
246
259
  logger.warning("Failed to sync nodes to LanceDB: %s", exc)
247
260
 
261
+ # Write to model-specific table
262
+ if model_key:
263
+ model_vs = self.get_vector_store_for_model(model_key)
264
+ if model_vs is not None:
265
+ try:
266
+ model_vs.add_nodes(
267
+ node_ids, embeddings, metadatas, documents,
268
+ )
269
+ except Exception as exc:
270
+ logger.warning(
271
+ "Failed to sync nodes to model table '%s': %s",
272
+ model_key, exc,
273
+ )
274
+
248
275
  def insert_edges(self, edges: Iterable[Edge]) -> None:
249
276
  cur = self.conn.cursor()
250
277
  cur.executemany(
@@ -253,6 +280,132 @@ class GraphStore:
253
280
  )
254
281
  self.conn.commit()
255
282
 
283
+ # ------------------------------------------------------------------
284
+ # Incremental index (single-file add / remove)
285
+ # ------------------------------------------------------------------
286
+
287
def remove_nodes_for_file(self, rel_path: str) -> int:
    """Remove all nodes and related edges for a specific file.

    Clears data from SQLite **and** every known LanceDB table
    (legacy + per-model).  Vector-store cleanup is best-effort: a
    failure there is logged at debug level and never raised.

    Args:
        rel_path: Relative file path as stored in the ``file_path``
            column (e.g. ``"src/utils.py"``).

    Returns:
        Number of SQLite node rows deleted (0 when the file has no
        nodes, in which case nothing else is touched).
    """
    cur = self.conn.cursor()

    # 1. How many nodes belong to this file?  Nothing to do when none.
    node_count = cur.execute(
        "SELECT COUNT(*) FROM nodes WHERE file_path = ?", (rel_path,),
    ).fetchone()[0]
    if not node_count:
        return 0

    # 2. Delete edges, then nodes.  Both statements select by ``file_path``
    #    through a sub-query instead of an ``IN (?, ?, ...)`` list, so the
    #    number of bound parameters is constant and cannot exceed SQLite's
    #    per-statement limit however many nodes the file contains.
    #    ``with self.conn`` commits on success and rolls back if either
    #    statement raises, keeping nodes and edges consistent.
    with self.conn:
        # Edges first: the sub-query needs the node rows to still exist.
        cur.execute(
            "DELETE FROM edges"
            " WHERE src IN (SELECT node_id FROM nodes WHERE file_path = ?)"
            " OR dst IN (SELECT node_id FROM nodes WHERE file_path = ?)",
            (rel_path, rel_path),
        )
        cur.execute("DELETE FROM nodes WHERE file_path = ?", (rel_path,))

    # 3. Remove from legacy LanceDB table
    if self.vector_store is not None:
        try:
            self.vector_store.delete_by_file_path(rel_path)
        except Exception as exc:
            logger.debug("Legacy vector delete for '%s': %s", rel_path, exc)

    # 4. Remove from all per-model LanceDB tables already opened
    for key, vs in self._model_vector_stores.items():
        try:
            vs.delete_by_file_path(rel_path)
        except Exception as exc:
            logger.debug(
                "Vector delete for '%s' in model table '%s': %s",
                rel_path, key, exc,
            )

    # 5. Also try tables that exist on disk but haven't been opened yet
    if VECTOR_STORE_AVAILABLE:
        try:
            probe = VectorStore(self.project_dir, model_key="")
            unopened = [
                mk for mk in probe.list_model_tables()
                if mk and mk not in self._model_vector_stores
            ]
        except Exception as exc:
            logger.debug("Could not enumerate model tables: %s", exc)
            unopened = []

        for mk in unopened:
            try:
                VectorStore(self.project_dir, model_key=mk).delete_by_file_path(rel_path)
            except Exception as exc:
                logger.debug(
                    "Vector delete for '%s' in model table '%s': %s",
                    rel_path, mk, exc,
                )

    return node_count
352
+
353
def index_single_file(
    self,
    file_path: Path,
    project_root: Path,
    embedder: Any,
    model_key: str = "",
) -> int:
    """Parse and index a single file incrementally.

    Parses the file and embeds its nodes *first*, and only then swaps
    the stored data (remove old nodes/edges, insert the new ones).  If
    the embedder raises, the exception propagates and the previously
    indexed data for the file is left untouched.

    A file that cannot be parsed, or that yields no nodes, ends up with
    no index entries: its stale data is removed and 0 is returned.

    Args:
        file_path: Absolute path to the source file.
        project_root: Project root (for computing relative paths).
        embedder: Object with ``embed_text(str) -> List[float]``.
        model_key: Embedding model identifier.  Forwarded to
            ``insert_nodes``; an empty string writes to the legacy
            table only.

    Returns:
        Number of nodes indexed for this file.

    Raises:
        ValueError: If *file_path* is not located under *project_root*.
    """
    from .parser import PythonGraphParser
    from .agents import _build_chunk_text

    # NOTE(review): str() of a relative Path uses the OS separator; this
    # must match how ``file_path`` was stored at full-index time - confirm
    # against the parser on Windows.
    rel_path = str(file_path.relative_to(project_root))

    # Parse the single file
    parser = PythonGraphParser(project_root)
    try:
        nodes, edges = parser.parse_file(file_path)
    except Exception as exc:
        logger.warning("Failed to parse %s: %s", file_path, exc)
        nodes, edges = [], []

    # Embed before touching the store, so a failing embedder cannot leave
    # the file de-indexed.
    node_payload = [
        (node, embedder.embed_text(_build_chunk_text(node)))
        for node in nodes
    ]

    # Swap: drop stale data for this file, then write the fresh data.
    self.remove_nodes_for_file(rel_path)
    if not node_payload:
        return 0

    self.insert_nodes(node_payload, model_key=model_key)
    self.insert_edges(edges)

    logger.info(
        "Incremental index: %d nodes, %d edges for %s",
        len(nodes), len(edges), rel_path,
    )
    return len(nodes)
408
+
256
409
  # ------------------------------------------------------------------
257
410
  # Read (structured)
258
411
  # ------------------------------------------------------------------
@@ -388,3 +541,146 @@ class GraphStore:
388
541
  edge_rows,
389
542
  )
390
543
  self.conn.commit()
544
+
545
+ # ------------------------------------------------------------------
546
+ # Per-model vector stores (auto re-ingestion)
547
+ # ------------------------------------------------------------------
548
+
549
def get_vector_store_for_model(self, model_key: str) -> Optional["VectorStore"]:
    """Return the LanceDB vector store dedicated to *model_key*.

    Every embedding model writes to its own table
    (``code_nodes_{model_key}``), which keeps vectors of different
    dimensionality apart.  Stores are created on first request and
    cached on the instance, so later calls return the same object.

    Returns ``None`` when LanceDB is not available or the store cannot
    be created (the failure is logged as a warning).
    """
    if not VECTOR_STORE_AVAILABLE:
        return None

    # Fast path: already opened for this model.
    try:
        return self._model_vector_stores[model_key]
    except KeyError:
        pass

    try:
        store = VectorStore(self.project_dir, model_key=model_key)
        self._model_vector_stores[model_key] = store
    except Exception as exc:
        logger.warning(
            "Cannot create vector store for model '%s': %s", model_key, exc,
        )
        return None
    return store
571
+
572
def reingest_for_model(
    self,
    model_key: str,
    embedder: Any,
    chunk_builder: Any = None,
) -> int:
    """Re-embed all SQLite nodes into a model-specific LanceDB table.

    Reads raw code/metadata from the SQLite ``nodes`` table,
    computes embeddings with *embedder*, and writes them into the
    LanceDB table for *model_key*, replacing its previous contents.

    The existing table is only cleared once every embedding has been
    computed, so an embedder failure (which propagates to the caller)
    leaves the old table intact.

    Args:
        model_key: Embedding model identifier (e.g. ``"minilm"``).
        embedder:  Object with an ``embed_text(str) -> List[float]``
                   method (and optionally ``embed_documents``, which is
                   preferred when present because it embeds in one batch).
        chunk_builder: Optional callable ``(dict) -> str`` that builds
                       the text chunk from a node row dict.  Falls back
                       to an internal default.

    Returns:
        Number of nodes ingested, or 0 when LanceDB is unavailable,
        there are no nodes, the embedder returned the wrong number of
        vectors, or the write failed.
    """
    vs = self.get_vector_store_for_model(model_key)
    if vs is None:
        return 0

    rows = self.get_nodes()
    if not rows:
        return 0

    if chunk_builder is None:
        chunk_builder = _default_chunk_builder

    node_ids: List[str] = []
    metadatas: List[Dict[str, str]] = []
    documents: List[str] = []
    texts: List[str] = []

    for row in rows:
        row_dict = dict(row)
        texts.append(chunk_builder(row_dict))
        node_ids.append(row_dict["node_id"])
        metadatas.append({
            "node_type": row_dict["node_type"],
            "file_path": row_dict["file_path"],
            "qualname": row_dict["qualname"],
            "name": row_dict["name"],
        })
        documents.append(row_dict["code"])

    # Batch-embed when possible, single-embed otherwise.  This runs
    # before the table is cleared on purpose (see docstring).
    if hasattr(embedder, "embed_documents"):
        embeddings = embedder.embed_documents(texts)
    else:
        embeddings = [embedder.embed_text(t) for t in texts]

    # Refuse to replace the table with misaligned id/vector pairs.
    if len(embeddings) != len(node_ids):
        logger.warning(
            "Re-ingestion for model '%s' aborted: got %d embeddings for %d nodes.",
            model_key, len(embeddings), len(node_ids),
        )
        return 0

    # Clear old data for this model's table and re-open it through the
    # cache so the fresh store object is the one remembered.
    vs.clear()
    self._model_vector_stores.pop(model_key, None)
    vs = self.get_vector_store_for_model(model_key)
    if vs is None:
        return 0

    try:
        vs.add_nodes(node_ids, embeddings, metadatas, documents)
    except Exception as exc:
        logger.warning("Re-ingestion for model '%s' failed: %s", model_key, exc)
        return 0

    logger.info(
        "Re-ingested %d nodes into table for embedding model '%s'.",
        len(node_ids), model_key,
    )
    return len(node_ids)
649
+
650
+
651
+ # ===================================================================
652
+ # Helpers
653
+ # ===================================================================
654
+
655
+ # Regex to strip bare import lines from chunk text (mirrors agents._IMPORT_RE)
656
+ _CHUNK_IMPORT_RE = re.compile(r"^(?:from\s+\S+\s+)?import\s+.+$", re.MULTILINE)
657
+ _MAX_CHUNK_CODE = 1500
658
+
659
+
660
+ def _default_chunk_builder(row: Dict[str, Any]) -> str:
661
+ """Build embedding text from a SQLite node row dict.
662
+
663
+ Mirrors :func:`codegraph_cli.agents._build_chunk_text` but works
664
+ with plain dicts instead of :class:`Node` objects.
665
+ """
666
+ parts: List[str] = [
667
+ f"file: {row['file_path']}",
668
+ f"symbol: {row['qualname']}",
669
+ f"type: {row['node_type']}",
670
+ ]
671
+ docstring = row.get("docstring") or ""
672
+ if docstring.strip():
673
+ parts.append(f"doc: {docstring.strip()}")
674
+
675
+ code: str = row.get("code", "")
676
+ if row["node_type"] != "module":
677
+ code = _CHUNK_IMPORT_RE.sub("", code).strip()
678
+ else:
679
+ code = code[:_MAX_CHUNK_CODE]
680
+
681
+ if len(code) > _MAX_CHUNK_CODE:
682
+ code = code[:_MAX_CHUNK_CODE] + "\n# ... (truncated)"
683
+ if code:
684
+ parts.append(code)
685
+
686
+ return "\n".join(parts)
@@ -49,22 +49,26 @@ class VectorStore:
49
49
  ======== ============ =====================================
50
50
  """
51
51
 
52
- def __init__(self, project_dir: Path) -> None:
52
+ def __init__(self, project_dir: Path, model_key: str = "") -> None:
53
53
  if not LANCE_AVAILABLE:
54
54
  raise ImportError(
55
55
  "lancedb is not installed. Install with: pip install lancedb pyarrow"
56
56
  )
57
57
 
58
58
  self.project_dir = project_dir
59
+ self.model_key = model_key
59
60
  self._lance_dir = project_dir / "lancedb"
60
61
  self._lance_dir.mkdir(exist_ok=True, parents=True)
61
62
 
63
+ # Each embedding model gets its own table to avoid dimension conflicts
64
+ self._table_name = f"code_nodes_{model_key}" if model_key else "code_nodes"
65
+
62
66
  self._db: Any = lancedb.connect(str(self._lance_dir))
63
67
  self._table: Optional[Any] = None
64
68
 
65
69
  # Try to open existing table
66
70
  try:
67
- self._table = self._db.open_table("code_nodes")
71
+ self._table = self._db.open_table(self._table_name)
68
72
  except Exception:
69
73
  self._table = None
70
74
 
@@ -106,7 +110,7 @@ class VectorStore:
106
110
  if self._table is None:
107
111
  # First insert – create the table (schema inferred from data)
108
112
  self._table = self._db.create_table(
109
- "code_nodes", data=rows, mode="overwrite",
113
+ self._table_name, data=rows, mode="overwrite",
110
114
  )
111
115
  else:
112
116
  # Subsequent inserts – upsert by deleting old IDs first
@@ -150,7 +154,12 @@ class VectorStore:
150
154
  return empty
151
155
 
152
156
  try:
153
- query = self._table.search(query_embedding).limit(n_results)
157
+ query = (
158
+ self._table
159
+ .search(query_embedding)
160
+ .metric("cosine")
161
+ .limit(n_results)
162
+ )
154
163
 
155
164
  # Apply metadata filters as SQL WHERE clause
156
165
  if where:
@@ -171,8 +180,11 @@ class VectorStore:
171
180
  docs: List[str] = []
172
181
 
173
182
  for row in results:
183
+ # With cosine metric, _distance is the *cosine distance*
184
+ # (1 − cos_sim), so values are in [0, 2].
185
+ dist = row.get("_distance", 0.0)
174
186
  ids.append(row.get("id", ""))
175
- distances.append(row.get("_distance", 0.0))
187
+ distances.append(dist)
176
188
  metas.append({
177
189
  "node_type": row.get("node_type", ""),
178
190
  "file_path": row.get("file_path", ""),
@@ -209,7 +221,12 @@ class VectorStore:
209
221
  return []
210
222
 
211
223
  try:
212
- query = self._table.search(query_embedding).limit(n_results)
224
+ query = (
225
+ self._table
226
+ .search(query_embedding)
227
+ .metric("cosine")
228
+ .limit(n_results)
229
+ )
213
230
  if where_sql:
214
231
  query = query.where(where_sql)
215
232
  return query.to_list()
@@ -226,7 +243,7 @@ class VectorStore:
226
243
  if self._table is None:
227
244
  return None
228
245
  try:
229
- import pandas as pd # type: ignore[import-untyped]
246
+ import pandas as pd # type: ignore[import-untyped] # noqa: F811
230
247
  df: pd.DataFrame = self._table.to_pandas()
231
248
  match = df[df["id"] == node_id]
232
249
  if match.empty:
@@ -260,14 +277,58 @@ class VectorStore:
260
277
  except Exception:
261
278
  pass
262
279
 
280
def delete_by_file_path(self, file_path: str) -> int:
    """Remove every row whose ``file_path`` column equals *file_path*.

    Args:
        file_path: Relative file path, exactly as it was stored in the
            ``file_path`` column during indexing.

    Returns:
        How many rows disappeared, measured as the row count before
        minus the row count after (never negative).  0 when there is
        no table or when the delete fails; failures are logged as a
        warning rather than raised.
    """
    if self._table is None:
        return 0

    try:
        rows_before = self._table.count_rows()
        # The path is embedded in a SQL predicate string, so single
        # quotes are doubled to keep it a valid (and safe) literal.
        quoted = file_path.replace("'", "''")
        predicate = f"file_path = '{quoted}'"
        self._table.delete(predicate)
        removed = rows_before - self._table.count_rows()
    except Exception as exc:
        logger.warning(
            "delete_by_file_path('%s') failed: %s", file_path, exc,
        )
        return 0

    return removed if removed > 0 else 0
304
+
263
305
  def clear(self) -> None:
264
306
  """Drop all data and recreate an empty table."""
265
307
  try:
266
- self._db.drop_table("code_nodes")
308
+ self._db.drop_table(self._table_name)
267
309
  except Exception:
268
310
  pass
269
311
  self._table = None
270
312
 
313
def list_model_tables(self) -> List[str]:
    """List the model keys that have a LanceDB table in this database.

    Per-model tables are called ``code_nodes_{model_key}``; the prefix
    is removed so only the key is reported.  The legacy ``code_nodes``
    table is reported as the empty string.  Tables with any other name
    are ignored.  Returns an empty list if the table names cannot be
    read.
    """
    try:
        table_names = self._db.table_names()
    except Exception:
        return []

    legacy = "code_nodes"
    prefix = "code_nodes_"
    return [
        "" if table == legacy else table[len(prefix):]
        for table in table_names
        if table == legacy or table.startswith(prefix)
    ]
331
+
271
332
  # ------------------------------------------------------------------
272
333
  # Informational
273
334
  # ------------------------------------------------------------------
@@ -291,3 +352,44 @@ class VectorStore:
291
352
  return df.head(limit).to_dict(orient="records")
292
353
  except Exception:
293
354
  return []
355
+
356
def debug_search(
    self,
    query_embedding: List[float],
    n_results: int = 5,
) -> List[Dict[str, Any]]:
    """Run a cosine search and report per-hit scoring details.

    Intended for inspecting retrieval quality (e.g. from the CLI).
    Where :meth:`search` returns its structured result, this method
    returns one flat dict per hit containing the raw ``_distance``
    (as ``cosine_distance``), the similarity derived from it
    (``1 - distance``, floored at 0), the main metadata fields and
    the first 120 characters of the stored document.

    Returns an empty list when there is no table or the query fails
    (the failure is logged as a warning).
    """
    table = self._table
    if table is None:
        return []

    try:
        query = table.search(query_embedding)
        query = query.metric("cosine")
        query = query.limit(n_results)
        hits = query.to_list()
    except Exception as exc:
        logger.warning("debug_search failed: %s", exc)
        return []

    report: List[Dict[str, Any]] = []
    for hit in hits:
        distance = hit.get("_distance", 0.0)
        entry: Dict[str, Any] = {
            "id": hit.get("id", ""),
            "name": hit.get("name", ""),
            "qualname": hit.get("qualname", ""),
            "node_type": hit.get("node_type", ""),
            "file_path": hit.get("file_path", ""),
            "cosine_distance": round(distance, 5),
            "similarity_score": round(max(0.0, 1.0 - distance), 5),
            # A stored document may be None; treat that as empty text.
            "document_preview": (hit.get("document", "") or "")[:120],
        }
        report.append(entry)
    return report