codegraph-cli 2.1.0__py3-none-any.whl → 2.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. codegraph_cli/__init__.py +1 -1
  2. codegraph_cli/agents.py +59 -3
  3. codegraph_cli/chat_agent.py +58 -11
  4. codegraph_cli/cli.py +569 -54
  5. codegraph_cli/cli_chat.py +204 -94
  6. codegraph_cli/cli_diagnose.py +13 -2
  7. codegraph_cli/cli_docs.py +207 -0
  8. codegraph_cli/cli_explore.py +1053 -0
  9. codegraph_cli/cli_export.py +941 -0
  10. codegraph_cli/cli_groups.py +33 -0
  11. codegraph_cli/cli_health.py +316 -0
  12. codegraph_cli/cli_history.py +213 -0
  13. codegraph_cli/cli_onboard.py +380 -0
  14. codegraph_cli/cli_quickstart.py +256 -0
  15. codegraph_cli/cli_refactor.py +17 -3
  16. codegraph_cli/cli_setup.py +12 -12
  17. codegraph_cli/cli_suggestions.py +90 -0
  18. codegraph_cli/cli_test.py +17 -3
  19. codegraph_cli/cli_tui.py +210 -0
  20. codegraph_cli/cli_v2.py +24 -4
  21. codegraph_cli/cli_watch.py +158 -0
  22. codegraph_cli/cli_workflows.py +255 -0
  23. codegraph_cli/codegen_agent.py +15 -1
  24. codegraph_cli/config.py +18 -5
  25. codegraph_cli/context_manager.py +117 -15
  26. codegraph_cli/crew_agents.py +32 -8
  27. codegraph_cli/crew_chat.py +146 -13
  28. codegraph_cli/crew_tools.py +30 -2
  29. codegraph_cli/embeddings.py +95 -5
  30. codegraph_cli/llm.py +42 -55
  31. codegraph_cli/project_context.py +64 -1
  32. codegraph_cli/rag.py +282 -19
  33. codegraph_cli/storage.py +310 -14
  34. codegraph_cli/vector_store.py +110 -8
  35. {codegraph_cli-2.1.0.dist-info → codegraph_cli-2.1.2.dist-info}/METADATA +75 -21
  36. codegraph_cli-2.1.2.dist-info/RECORD +55 -0
  37. codegraph_cli-2.1.2.dist-info/entry_points.txt +2 -0
  38. codegraph_cli-2.1.0.dist-info/RECORD +0 -43
  39. codegraph_cli-2.1.0.dist-info/entry_points.txt +0 -2
  40. {codegraph_cli-2.1.0.dist-info → codegraph_cli-2.1.2.dist-info}/WHEEL +0 -0
  41. {codegraph_cli-2.1.0.dist-info → codegraph_cli-2.1.2.dist-info}/licenses/LICENSE +0 -0
  42. {codegraph_cli-2.1.0.dist-info → codegraph_cli-2.1.2.dist-info}/top_level.txt +0 -0
codegraph_cli/rag.py CHANGED
@@ -3,13 +3,30 @@
3
3
  Uses LanceDB hybrid search (vector + metadata filters) for fast,
4
4
  accurate code retrieval. Falls back to brute-force cosine similarity
5
5
  when the vector store is unavailable.
6
+
7
+ Improvements over the original implementation:
8
+
9
+ - **Cosine metric** — LanceDB searches use ``metric="cosine"`` so
10
+ ``_distance`` values are true cosine distances (``1 − cos_sim``).
11
+ - **Minimum score threshold** — results below a configurable quality
12
+ floor are discarded before returning to callers.
13
+ - **Graph-neighbour augmentation** — after the initial semantic top-k,
14
+ direct dependency neighbours of the best hits are fetched from the
15
+ graph store and merged (de-duplicated) into the result set.
16
+ - **Result caching** — a small LRU dict avoids re-computing identical
17
+ queries within the same session.
18
+ - **Context compression** — :meth:`retrieve_context` strips import
19
+ lines, trims excessively long snippets, and formats structured
20
+ metadata so the LLM receives clean, information-dense context.
6
21
  """
7
22
 
8
23
  from __future__ import annotations
9
24
 
10
25
  import json
11
26
  import logging
12
- from typing import Any, Dict, List, Optional, Union
27
+ import re
28
+ from collections import OrderedDict
29
+ from typing import Any, Dict, List, Optional, Set, Union
13
30
 
14
31
  from .embeddings import HashEmbeddingModel, TransformerEmbedder, cosine_similarity
15
32
  from .models import SearchResult
@@ -17,16 +34,33 @@ from .storage import GraphStore
17
34
 
18
35
  logger = logging.getLogger(__name__)
19
36
 
37
+ # Minimum similarity score (0..1) below which results are dropped.
38
+ MIN_SCORE_THRESHOLD: float = 0.05
39
+
40
+ # Import line regex — used to strip bare imports from context snippets.
41
+ _IMPORT_RE = re.compile(r"^(?:from\s+\S+\s+)?import\s+.+\n?", re.MULTILINE)
42
+
43
+ # Max characters per snippet in formatted context output.
44
+ _MAX_SNIPPET_CHARS = 1000
45
+
46
+ # Max entries in the per-session query cache.
47
+ _CACHE_SIZE = 64
48
+
20
49
 
21
50
  class RAGRetriever:
22
51
  """Retrieve relevant code nodes from graph memory via semantic similarity.
23
52
 
24
53
  Supports two modes:
25
54
 
26
- 1. **Vector store mode** (fast, preferred) delegates to LanceDB via
27
- ``GraphStore.vector_store``.
28
- 2. **Brute-force mode** (fallback) scans all SQLite rows and computes
29
- cosine similarity in Python.
55
+ 1. **Vector store mode** (fast, preferred) delegates to a
56
+ **model-specific** LanceDB table via ``GraphStore``. Each
57
+ embedding model gets its own table so dimension mismatches
58
+ cannot occur. If no table exists for the current model, the
59
+ retriever automatically re-embeds all nodes from SQLite
60
+ (one-time, transparent to the caller).
61
+ 2. **Brute-force mode** (fallback) — scans all SQLite rows and
62
+ computes cosine similarity in Python. Used only when LanceDB
63
+ is not installed at all.
30
64
 
31
65
  The ``embedding_model`` argument accepts either a
32
66
  :class:`~codegraph_cli.embeddings.TransformerEmbedder` or the lightweight
@@ -37,10 +71,89 @@ class RAGRetriever:
37
71
  self,
38
72
  store: GraphStore,
39
73
  embedding_model: Union[TransformerEmbedder, HashEmbeddingModel, Any],
74
+ min_score: float = MIN_SCORE_THRESHOLD,
75
+ enable_graph_augment: bool = True,
40
76
  ) -> None:
41
77
  self.store = store
42
78
  self.embedding_model = embedding_model
43
- self.use_vector_store: bool = store.vector_store is not None
79
+ self.min_score = min_score
80
+ self.enable_graph_augment = enable_graph_augment
81
+
82
+ # Resolve the model-specific vector store
83
+ self._model_key: str = getattr(embedding_model, "model_key", "hash")
84
+ self._model_vs: Optional[Any] = None
85
+ self.use_vector_store: bool = False
86
+ self._init_model_vector_store()
87
+
88
+ # Simple LRU cache: query_text → List[SearchResult]
89
+ self._cache: OrderedDict[str, List[SearchResult]] = OrderedDict()
90
+
91
+ # ------------------------------------------------------------------
92
+ # Model-specific vector store initialisation
93
+ # ------------------------------------------------------------------
94
+
95
+ def _init_model_vector_store(self) -> None:
96
+ """Obtain the LanceDB table for the current embedding model.
97
+
98
+ If the table doesn't exist or is empty, trigger a one-time
99
+ re-ingestion from SQLite so every model always has its own
100
+ properly-dimensioned vector store.
101
+ """
102
+ self._model_vs = self.store.get_vector_store_for_model(self._model_key)
103
+ if self._model_vs is None:
104
+ # LanceDB not available — fall back to brute-force
105
+ self.use_vector_store = False
106
+ return
107
+
108
+ table_ready = (
109
+ getattr(self._model_vs, "_table", None) is not None
110
+ and self._model_vs.count() > 0
111
+ )
112
+
113
+ if not table_ready:
114
+ # Table is empty / missing — auto re-ingest from SQLite
115
+ node_count = self.store.get_nodes()
116
+ if node_count:
117
+ logger.info(
118
+ "No vector table for model '%s' — re-ingesting %d nodes…",
119
+ self._model_key, len(node_count),
120
+ )
121
+ n = self.store.reingest_for_model(
122
+ self._model_key, self.embedding_model,
123
+ )
124
+ if n > 0:
125
+ # Refresh the reference after ingestion
126
+ self._model_vs = self.store.get_vector_store_for_model(
127
+ self._model_key,
128
+ )
129
+
130
+ self.use_vector_store = (
131
+ self._model_vs is not None
132
+ and getattr(self._model_vs, "_table", None) is not None
133
+ )
134
+
135
+ # ------------------------------------------------------------------
136
+ # Cache helpers
137
+ # ------------------------------------------------------------------
138
+
139
+ def _cache_key(self, query: str, top_k: int, node_type: Optional[str]) -> str:
140
+ return f"{query}||{top_k}||{node_type or ''}"
141
+
142
+ def _cache_get(self, key: str) -> Optional[List[SearchResult]]:
143
+ if key in self._cache:
144
+ self._cache.move_to_end(key)
145
+ return self._cache[key]
146
+ return None
147
+
148
def _cache_put(self, key: str, value: List[SearchResult]) -> None:
    """Store *value* under *key* and evict the oldest entries if needed.

    A shallow copy of *value* is stored, so later in-place changes to the
    caller's list do not leak into the cache.  The entry becomes the most
    recently used one; entries are then dropped from the least recently
    used end until at most ``_CACHE_SIZE`` remain.
    """
    self._cache[key] = list(value)
    self._cache.move_to_end(key)
    while len(self._cache) > _CACHE_SIZE:
        self._cache.popitem(last=False)
153
+
154
def clear_cache(self) -> None:
    """Drop every cached query result, keeping the same cache object."""
    cache = self._cache
    cache.clear()
44
157
 
45
158
  # ------------------------------------------------------------------
46
159
  # Primary search
@@ -65,13 +178,39 @@ class RAGRetriever:
65
178
  Returns:
66
179
  List of :class:`SearchResult` sorted by relevance (highest first).
67
180
  """
181
+ ck = self._cache_key(query, top_k, node_type)
182
+ cached = self._cache_get(ck)
183
+ if cached is not None:
184
+ return cached
185
+
68
186
  query_emb: List[float] = self.embedding_model.embed_text(query)
69
187
 
70
188
  if self.use_vector_store:
71
- return self._search_vector_store(
189
+ results = self._search_vector_store(
72
190
  query_emb, top_k, node_type, file_filter,
73
191
  )
74
- return self._search_brute_force(query_emb, top_k, node_type)
192
+ # Fall back to brute-force if the vector store returned nothing
193
+ # (e.g. empty table, dimension mismatch, or LanceDB error).
194
+ if not results:
195
+ results = self._search_brute_force(query_emb, top_k, node_type)
196
+ else:
197
+ results = self._search_brute_force(query_emb, top_k, node_type)
198
+
199
+ # ── Graph-neighbour augmentation ────────────────────────
200
+ if self.enable_graph_augment and results:
201
+ results = self._augment_with_graph_neighbours(
202
+ results, query_emb, top_k,
203
+ )
204
+
205
+ # ── Minimum-score gate ──────────────────────────────────
206
+ results = [r for r in results if r.score >= self.min_score]
207
+
208
+ # ── Final sort & trim ───────────────────────────────────
209
+ results.sort(key=lambda r: r.score, reverse=True)
210
+ results = results[:top_k]
211
+
212
+ self._cache_put(ck, results)
213
+ return results
75
214
 
76
215
  # ------------------------------------------------------------------
77
216
  # LanceDB path (fast)
@@ -84,7 +223,7 @@ class RAGRetriever:
84
223
  node_type: Optional[str],
85
224
  file_filter: Optional[str],
86
225
  ) -> List[SearchResult]:
87
- assert self.store.vector_store is not None
226
+ assert self._model_vs is not None
88
227
 
89
228
  # Build SQL WHERE clause for hybrid search
90
229
  clauses: List[str] = []
@@ -94,7 +233,7 @@ class RAGRetriever:
94
233
  clauses.append(f'file_path LIKE "{file_filter}"')
95
234
  where_sql = " AND ".join(clauses) if clauses else None
96
235
 
97
- raw_results = self.store.vector_store.hybrid_search(
236
+ raw_results = self._model_vs.hybrid_search(
98
237
  query_embedding=query_emb,
99
238
  n_results=top_k,
100
239
  where_sql=where_sql,
@@ -103,10 +242,9 @@ class RAGRetriever:
103
242
  results: List[SearchResult] = []
104
243
  for row in raw_results:
105
244
  distance = row.get("_distance", 0.0)
106
- # LanceDB returns L2 distance by default; convert to a similarity
107
- # score in [0, 1]. For cosine distance the relationship is
108
- # score = 1 - distance (since embeddings are unit-normalised).
109
- score = max(0.0, 1.0 - distance)
245
+ # With cosine metric, _distance is cosine distance [0, 2].
246
+ # Similarity = 1 − distance, clamped to [0, 1].
247
+ score = max(0.0, min(1.0, 1.0 - distance))
110
248
 
111
249
  # Enrich from SQLite if full node data is needed
112
250
  node_row = self.store.get_node(row.get("id", ""))
@@ -148,12 +286,18 @@ class RAGRetriever:
148
286
  node_type: Optional[str],
149
287
  ) -> List[SearchResult]:
150
288
  results: List[SearchResult] = []
289
+ query_dim = len(query_emb)
151
290
  for row in self.store.get_nodes():
152
291
  if node_type and row["node_type"] != node_type:
153
292
  continue
154
293
  embedding = json.loads(row["embedding"] or "[]")
294
+ if not embedding:
295
+ continue
296
+ # Skip rows whose stored embedding dimension doesn't match
297
+ if len(embedding) != query_dim:
298
+ continue
155
299
  score = cosine_similarity(query_emb, embedding)
156
- if score <= 0:
300
+ if score < self.min_score:
157
301
  continue
158
302
  results.append(SearchResult(
159
303
  node_id=row["node_id"],
@@ -168,6 +312,77 @@ class RAGRetriever:
168
312
 
169
313
  return sorted(results, key=lambda r: r.score, reverse=True)[:top_k]
170
314
 
315
+ # ------------------------------------------------------------------
316
+ # Graph-neighbour augmentation
317
+ # ------------------------------------------------------------------
318
+
319
+ def _augment_with_graph_neighbours(
320
+ self,
321
+ results: List[SearchResult],
322
+ query_emb: List[float],
323
+ max_total: int,
324
+ ) -> List[SearchResult]:
325
+ """Expand the result set with direct dependency neighbours.
326
+
327
+ For the top-3 semantic hits, fetch their outgoing and incoming
328
+ graph edges and score the neighbour nodes against the query.
329
+ Merge into *results* (deduplicated by ``node_id``).
330
+ """
331
+ seen_ids: Set[str] = {r.node_id for r in results}
332
+ extra: List[SearchResult] = []
333
+
334
+ # Only augment from the best 3 hits to keep it fast
335
+ for sr in results[:3]:
336
+ for edge in self.store.neighbors(sr.node_id):
337
+ dst_id = edge["dst"]
338
+ if dst_id in seen_ids:
339
+ continue
340
+ seen_ids.add(dst_id)
341
+ node_row = self.store.get_node(dst_id)
342
+ if node_row is None:
343
+ continue
344
+ emb = json.loads(node_row["embedding"] or "[]")
345
+ if emb:
346
+ score = cosine_similarity(query_emb, emb)
347
+ else:
348
+ score = sr.score * 0.3
349
+ extra.append(SearchResult(
350
+ node_id=node_row["node_id"],
351
+ score=score,
352
+ node_type=node_row["node_type"],
353
+ qualname=node_row["qualname"],
354
+ file_path=node_row["file_path"],
355
+ start_line=node_row["start_line"],
356
+ end_line=node_row["end_line"],
357
+ snippet=node_row["code"],
358
+ ))
359
+
360
+ for edge in self.store.reverse_neighbors(sr.node_id):
361
+ src_id = edge["src"]
362
+ if src_id in seen_ids:
363
+ continue
364
+ seen_ids.add(src_id)
365
+ node_row = self.store.get_node(src_id)
366
+ if node_row is None:
367
+ continue
368
+ emb = json.loads(node_row["embedding"] or "[]")
369
+ if emb:
370
+ score = cosine_similarity(query_emb, emb)
371
+ else:
372
+ score = sr.score * 0.3
373
+ extra.append(SearchResult(
374
+ node_id=node_row["node_id"],
375
+ score=score,
376
+ node_type=node_row["node_type"],
377
+ qualname=node_row["qualname"],
378
+ file_path=node_row["file_path"],
379
+ start_line=node_row["start_line"],
380
+ end_line=node_row["end_line"],
381
+ snippet=node_row["code"],
382
+ ))
383
+
384
+ return results + extra
385
+
171
386
  # ------------------------------------------------------------------
172
387
  # Convenience
173
388
  # ------------------------------------------------------------------
@@ -182,6 +397,8 @@ class RAGRetriever:
182
397
  """Return a formatted string of the top search results.
183
398
 
184
399
  Useful for injecting relevant code context into LLM prompts.
400
+ Applies context compression: strips imports, trims long code,
401
+ and formats structured metadata.
185
402
  """
186
403
  matches = self.search(
187
404
  query, top_k=top_k, node_type=node_type, file_filter=file_filter,
@@ -191,10 +408,56 @@ class RAGRetriever:
191
408
 
192
409
  blocks: List[str] = []
193
410
  for item in matches:
411
+ snippet = _compress_snippet(item.snippet)
194
412
  blocks.append(
195
- f"[{item.node_type}] {item.qualname} "
196
- f"({item.file_path}:{item.start_line})\n"
197
- f"Score: {item.score:.3f}\n"
198
- f"```python\n{item.snippet[:1200]}\n```"
413
+ f"[{item.node_type}] {item.qualname}\n"
414
+ f"file: {item.file_path}:{item.start_line}\n"
415
+ f"score: {item.score:.3f}\n"
416
+ f"```python\n{snippet}\n```"
199
417
  )
200
418
  return "\n\n".join(blocks)
419
+
420
+ # ------------------------------------------------------------------
421
+ # Debug helper
422
+ # ------------------------------------------------------------------
423
+
424
def debug_search(
    self,
    query: str,
    top_k: int = 10,
) -> List[Dict[str, Any]]:
    """Run :meth:`search` and report each hit as a plain dict.

    Intended for the ``cg debug-rag`` CLI command.  Each dict carries the
    node identity, its location, the score rounded to five decimals, the
    line span as ``"start-end"`` and the snippet length.
    """
    return [
        {
            "node_id": hit.node_id,
            "qualname": hit.qualname,
            "node_type": hit.node_type,
            "file_path": hit.file_path,
            "score": round(hit.score, 5),
            "lines": f"{hit.start_line}-{hit.end_line}",
            "snippet_len": len(hit.snippet),
        }
        for hit in self.search(query, top_k=top_k)
    ]
446
+
447
+
448
+ # ===================================================================
449
+ # Context compression utilities
450
+ # ===================================================================
451
+
452
def _compress_snippet(code: str, max_chars: int = _MAX_SNIPPET_CHARS) -> str:
    """Clean and truncate a code snippet for LLM context.

    1. Remove every line matched by ``_IMPORT_RE``.
    2. Collapse runs of three or more newlines into one blank line and
       strip surrounding whitespace.
    3. If the result is longer than *max_chars*, keep the first
       *max_chars* characters and append a ``# ... (truncated)`` marker
       line (the marker itself is not counted against *max_chars*).

    Args:
        code: Snippet text.  An empty or ``None`` value yields ``""``
            rather than raising, since a node may carry no code.
        max_chars: Maximum number of snippet characters to keep.

    Returns:
        The compressed snippet.
    """
    if not code:
        return ""
    code = _IMPORT_RE.sub("", code)
    code = re.sub(r"\n{3,}", "\n\n", code).strip()
    if len(code) > max_chars:
        code = code[:max_chars] + "\n# ... (truncated)"
    return code