codegraph-cli 2.1.0__py3-none-any.whl → 2.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codegraph_cli/__init__.py +1 -1
- codegraph_cli/agents.py +59 -3
- codegraph_cli/chat_agent.py +58 -11
- codegraph_cli/cli.py +569 -54
- codegraph_cli/cli_chat.py +204 -94
- codegraph_cli/cli_diagnose.py +13 -2
- codegraph_cli/cli_docs.py +207 -0
- codegraph_cli/cli_explore.py +1053 -0
- codegraph_cli/cli_export.py +941 -0
- codegraph_cli/cli_groups.py +33 -0
- codegraph_cli/cli_health.py +316 -0
- codegraph_cli/cli_history.py +213 -0
- codegraph_cli/cli_onboard.py +380 -0
- codegraph_cli/cli_quickstart.py +256 -0
- codegraph_cli/cli_refactor.py +17 -3
- codegraph_cli/cli_setup.py +12 -12
- codegraph_cli/cli_suggestions.py +90 -0
- codegraph_cli/cli_test.py +17 -3
- codegraph_cli/cli_tui.py +210 -0
- codegraph_cli/cli_v2.py +24 -4
- codegraph_cli/cli_watch.py +158 -0
- codegraph_cli/cli_workflows.py +255 -0
- codegraph_cli/codegen_agent.py +15 -1
- codegraph_cli/config.py +18 -5
- codegraph_cli/context_manager.py +117 -15
- codegraph_cli/crew_agents.py +32 -8
- codegraph_cli/crew_chat.py +146 -13
- codegraph_cli/crew_tools.py +30 -2
- codegraph_cli/embeddings.py +95 -5
- codegraph_cli/llm.py +42 -55
- codegraph_cli/project_context.py +64 -1
- codegraph_cli/rag.py +282 -19
- codegraph_cli/storage.py +310 -14
- codegraph_cli/vector_store.py +110 -8
- {codegraph_cli-2.1.0.dist-info → codegraph_cli-2.1.2.dist-info}/METADATA +75 -21
- codegraph_cli-2.1.2.dist-info/RECORD +55 -0
- codegraph_cli-2.1.2.dist-info/entry_points.txt +2 -0
- codegraph_cli-2.1.0.dist-info/RECORD +0 -43
- codegraph_cli-2.1.0.dist-info/entry_points.txt +0 -2
- {codegraph_cli-2.1.0.dist-info → codegraph_cli-2.1.2.dist-info}/WHEEL +0 -0
- {codegraph_cli-2.1.0.dist-info → codegraph_cli-2.1.2.dist-info}/licenses/LICENSE +0 -0
- {codegraph_cli-2.1.0.dist-info → codegraph_cli-2.1.2.dist-info}/top_level.txt +0 -0
codegraph_cli/rag.py
CHANGED
|
@@ -3,13 +3,30 @@
|
|
|
3
3
|
Uses LanceDB hybrid search (vector + metadata filters) for fast,
|
|
4
4
|
accurate code retrieval. Falls back to brute-force cosine similarity
|
|
5
5
|
when the vector store is unavailable.
|
|
6
|
+
|
|
7
|
+
Improvements over the original implementation:
|
|
8
|
+
|
|
9
|
+
- **Cosine metric** — LanceDB searches use ``metric="cosine"`` so
|
|
10
|
+
``_distance`` values are true cosine distances (``1 − cos_sim``).
|
|
11
|
+
- **Minimum score threshold** — results below a configurable quality
|
|
12
|
+
floor are discarded before returning to callers.
|
|
13
|
+
- **Graph-neighbour augmentation** — after the initial semantic top-k,
|
|
14
|
+
direct dependency neighbours of the best hits are fetched from the
|
|
15
|
+
graph store and merged (de-duplicated) into the result set.
|
|
16
|
+
- **Result caching** — a small LRU dict avoids re-computing identical
|
|
17
|
+
queries within the same session.
|
|
18
|
+
- **Context compression** — :meth:`retrieve_context` strips import
|
|
19
|
+
lines, trims excessively long snippets, and formats structured
|
|
20
|
+
metadata so the LLM receives clean, information-dense context.
|
|
6
21
|
"""
|
|
7
22
|
|
|
8
23
|
from __future__ import annotations
|
|
9
24
|
|
|
10
25
|
import json
|
|
11
26
|
import logging
|
|
12
|
-
|
|
27
|
+
import re
|
|
28
|
+
from collections import OrderedDict
|
|
29
|
+
from typing import Any, Dict, List, Optional, Set, Union
|
|
13
30
|
|
|
14
31
|
from .embeddings import HashEmbeddingModel, TransformerEmbedder, cosine_similarity
|
|
15
32
|
from .models import SearchResult
|
|
@@ -17,16 +34,33 @@ from .storage import GraphStore
|
|
|
17
34
|
|
|
18
35
|
logger = logging.getLogger(__name__)
|
|
19
36
|
|
|
37
|
+
# Minimum similarity score (0..1) below which results are dropped.
|
|
38
|
+
MIN_SCORE_THRESHOLD: float = 0.05
|
|
39
|
+
|
|
40
|
+
# Import line regex — used to strip bare imports from context snippets.
|
|
41
|
+
_IMPORT_RE = re.compile(r"^(?:from\s+\S+\s+)?import\s+.+\n?", re.MULTILINE)
|
|
42
|
+
|
|
43
|
+
# Max characters per snippet in formatted context output.
|
|
44
|
+
_MAX_SNIPPET_CHARS = 1000
|
|
45
|
+
|
|
46
|
+
# Max entries in the per-session query cache.
|
|
47
|
+
_CACHE_SIZE = 64
|
|
48
|
+
|
|
20
49
|
|
|
21
50
|
class RAGRetriever:
|
|
22
51
|
"""Retrieve relevant code nodes from graph memory via semantic similarity.
|
|
23
52
|
|
|
24
53
|
Supports two modes:
|
|
25
54
|
|
|
26
|
-
1. **Vector store mode** (fast, preferred)
|
|
27
|
-
``GraphStore
|
|
28
|
-
|
|
29
|
-
|
|
55
|
+
1. **Vector store mode** (fast, preferred) — delegates to a
|
|
56
|
+
**model-specific** LanceDB table via ``GraphStore``. Each
|
|
57
|
+
embedding model gets its own table so dimension mismatches
|
|
58
|
+
cannot occur. If no table exists for the current model, the
|
|
59
|
+
retriever automatically re-embeds all nodes from SQLite
|
|
60
|
+
(one-time, transparent to the caller).
|
|
61
|
+
2. **Brute-force mode** (fallback) — scans all SQLite rows and
|
|
62
|
+
computes cosine similarity in Python. Used only when LanceDB
|
|
63
|
+
is not installed at all.
|
|
30
64
|
|
|
31
65
|
The ``embedding_model`` argument accepts either a
|
|
32
66
|
:class:`~codegraph_cli.embeddings.TransformerEmbedder` or the lightweight
|
|
@@ -37,10 +71,89 @@ class RAGRetriever:
|
|
|
37
71
|
self,
|
|
38
72
|
store: GraphStore,
|
|
39
73
|
embedding_model: Union[TransformerEmbedder, HashEmbeddingModel, Any],
|
|
74
|
+
min_score: float = MIN_SCORE_THRESHOLD,
|
|
75
|
+
enable_graph_augment: bool = True,
|
|
40
76
|
) -> None:
|
|
41
77
|
self.store = store
|
|
42
78
|
self.embedding_model = embedding_model
|
|
43
|
-
self.
|
|
79
|
+
self.min_score = min_score
|
|
80
|
+
self.enable_graph_augment = enable_graph_augment
|
|
81
|
+
|
|
82
|
+
# Resolve the model-specific vector store
|
|
83
|
+
self._model_key: str = getattr(embedding_model, "model_key", "hash")
|
|
84
|
+
self._model_vs: Optional[Any] = None
|
|
85
|
+
self.use_vector_store: bool = False
|
|
86
|
+
self._init_model_vector_store()
|
|
87
|
+
|
|
88
|
+
# Simple LRU cache: query_text → List[SearchResult]
|
|
89
|
+
self._cache: OrderedDict[str, List[SearchResult]] = OrderedDict()
|
|
90
|
+
|
|
91
|
+
# ------------------------------------------------------------------
|
|
92
|
+
# Model-specific vector store initialisation
|
|
93
|
+
# ------------------------------------------------------------------
|
|
94
|
+
|
|
95
|
+
def _init_model_vector_store(self) -> None:
|
|
96
|
+
"""Obtain the LanceDB table for the current embedding model.
|
|
97
|
+
|
|
98
|
+
If the table doesn't exist or is empty, trigger a one-time
|
|
99
|
+
re-ingestion from SQLite so every model always has its own
|
|
100
|
+
properly-dimensioned vector store.
|
|
101
|
+
"""
|
|
102
|
+
self._model_vs = self.store.get_vector_store_for_model(self._model_key)
|
|
103
|
+
if self._model_vs is None:
|
|
104
|
+
# LanceDB not available — fall back to brute-force
|
|
105
|
+
self.use_vector_store = False
|
|
106
|
+
return
|
|
107
|
+
|
|
108
|
+
table_ready = (
|
|
109
|
+
getattr(self._model_vs, "_table", None) is not None
|
|
110
|
+
and self._model_vs.count() > 0
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
if not table_ready:
|
|
114
|
+
# Table is empty / missing — auto re-ingest from SQLite
|
|
115
|
+
node_count = self.store.get_nodes()
|
|
116
|
+
if node_count:
|
|
117
|
+
logger.info(
|
|
118
|
+
"No vector table for model '%s' — re-ingesting %d nodes…",
|
|
119
|
+
self._model_key, len(node_count),
|
|
120
|
+
)
|
|
121
|
+
n = self.store.reingest_for_model(
|
|
122
|
+
self._model_key, self.embedding_model,
|
|
123
|
+
)
|
|
124
|
+
if n > 0:
|
|
125
|
+
# Refresh the reference after ingestion
|
|
126
|
+
self._model_vs = self.store.get_vector_store_for_model(
|
|
127
|
+
self._model_key,
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
self.use_vector_store = (
|
|
131
|
+
self._model_vs is not None
|
|
132
|
+
and getattr(self._model_vs, "_table", None) is not None
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
# ------------------------------------------------------------------
|
|
136
|
+
# Cache helpers
|
|
137
|
+
# ------------------------------------------------------------------
|
|
138
|
+
|
|
139
|
+
def _cache_key(self, query: str, top_k: int, node_type: Optional[str]) -> str:
|
|
140
|
+
return f"{query}||{top_k}||{node_type or ''}"
|
|
141
|
+
|
|
142
|
+
def _cache_get(self, key: str) -> Optional[List[SearchResult]]:
|
|
143
|
+
if key in self._cache:
|
|
144
|
+
self._cache.move_to_end(key)
|
|
145
|
+
return self._cache[key]
|
|
146
|
+
return None
|
|
147
|
+
|
|
148
|
+
def _cache_put(self, key: str, value: List[SearchResult]) -> None:
|
|
149
|
+
self._cache[key] = value
|
|
150
|
+
self._cache.move_to_end(key)
|
|
151
|
+
while len(self._cache) > _CACHE_SIZE:
|
|
152
|
+
self._cache.popitem(last=False)
|
|
153
|
+
|
|
154
|
+
def clear_cache(self) -> None:
|
|
155
|
+
"""Flush the query result cache."""
|
|
156
|
+
self._cache.clear()
|
|
44
157
|
|
|
45
158
|
# ------------------------------------------------------------------
|
|
46
159
|
# Primary search
|
|
@@ -65,13 +178,39 @@ class RAGRetriever:
|
|
|
65
178
|
Returns:
|
|
66
179
|
List of :class:`SearchResult` sorted by relevance (highest first).
|
|
67
180
|
"""
|
|
181
|
+
ck = self._cache_key(query, top_k, node_type)
|
|
182
|
+
cached = self._cache_get(ck)
|
|
183
|
+
if cached is not None:
|
|
184
|
+
return cached
|
|
185
|
+
|
|
68
186
|
query_emb: List[float] = self.embedding_model.embed_text(query)
|
|
69
187
|
|
|
70
188
|
if self.use_vector_store:
|
|
71
|
-
|
|
189
|
+
results = self._search_vector_store(
|
|
72
190
|
query_emb, top_k, node_type, file_filter,
|
|
73
191
|
)
|
|
74
|
-
|
|
192
|
+
# Fall back to brute-force if the vector store returned nothing
|
|
193
|
+
# (e.g. empty table, dimension mismatch, or LanceDB error).
|
|
194
|
+
if not results:
|
|
195
|
+
results = self._search_brute_force(query_emb, top_k, node_type)
|
|
196
|
+
else:
|
|
197
|
+
results = self._search_brute_force(query_emb, top_k, node_type)
|
|
198
|
+
|
|
199
|
+
# ── Graph-neighbour augmentation ────────────────────────
|
|
200
|
+
if self.enable_graph_augment and results:
|
|
201
|
+
results = self._augment_with_graph_neighbours(
|
|
202
|
+
results, query_emb, top_k,
|
|
203
|
+
)
|
|
204
|
+
|
|
205
|
+
# ── Minimum-score gate ──────────────────────────────────
|
|
206
|
+
results = [r for r in results if r.score >= self.min_score]
|
|
207
|
+
|
|
208
|
+
# ── Final sort & trim ───────────────────────────────────
|
|
209
|
+
results.sort(key=lambda r: r.score, reverse=True)
|
|
210
|
+
results = results[:top_k]
|
|
211
|
+
|
|
212
|
+
self._cache_put(ck, results)
|
|
213
|
+
return results
|
|
75
214
|
|
|
76
215
|
# ------------------------------------------------------------------
|
|
77
216
|
# LanceDB path (fast)
|
|
@@ -84,7 +223,7 @@ class RAGRetriever:
|
|
|
84
223
|
node_type: Optional[str],
|
|
85
224
|
file_filter: Optional[str],
|
|
86
225
|
) -> List[SearchResult]:
|
|
87
|
-
assert self.
|
|
226
|
+
assert self._model_vs is not None
|
|
88
227
|
|
|
89
228
|
# Build SQL WHERE clause for hybrid search
|
|
90
229
|
clauses: List[str] = []
|
|
@@ -94,7 +233,7 @@ class RAGRetriever:
|
|
|
94
233
|
clauses.append(f'file_path LIKE "{file_filter}"')
|
|
95
234
|
where_sql = " AND ".join(clauses) if clauses else None
|
|
96
235
|
|
|
97
|
-
raw_results = self.
|
|
236
|
+
raw_results = self._model_vs.hybrid_search(
|
|
98
237
|
query_embedding=query_emb,
|
|
99
238
|
n_results=top_k,
|
|
100
239
|
where_sql=where_sql,
|
|
@@ -103,10 +242,9 @@ class RAGRetriever:
|
|
|
103
242
|
results: List[SearchResult] = []
|
|
104
243
|
for row in raw_results:
|
|
105
244
|
distance = row.get("_distance", 0.0)
|
|
106
|
-
#
|
|
107
|
-
#
|
|
108
|
-
|
|
109
|
-
score = max(0.0, 1.0 - distance)
|
|
245
|
+
# With cosine metric, _distance is cosine distance ∈ [0, 2].
|
|
246
|
+
# Similarity = 1 − distance, clamped to [0, 1].
|
|
247
|
+
score = max(0.0, min(1.0, 1.0 - distance))
|
|
110
248
|
|
|
111
249
|
# Enrich from SQLite if full node data is needed
|
|
112
250
|
node_row = self.store.get_node(row.get("id", ""))
|
|
@@ -148,12 +286,18 @@ class RAGRetriever:
|
|
|
148
286
|
node_type: Optional[str],
|
|
149
287
|
) -> List[SearchResult]:
|
|
150
288
|
results: List[SearchResult] = []
|
|
289
|
+
query_dim = len(query_emb)
|
|
151
290
|
for row in self.store.get_nodes():
|
|
152
291
|
if node_type and row["node_type"] != node_type:
|
|
153
292
|
continue
|
|
154
293
|
embedding = json.loads(row["embedding"] or "[]")
|
|
294
|
+
if not embedding:
|
|
295
|
+
continue
|
|
296
|
+
# Skip rows whose stored embedding dimension doesn't match
|
|
297
|
+
if len(embedding) != query_dim:
|
|
298
|
+
continue
|
|
155
299
|
score = cosine_similarity(query_emb, embedding)
|
|
156
|
-
if score
|
|
300
|
+
if score < self.min_score:
|
|
157
301
|
continue
|
|
158
302
|
results.append(SearchResult(
|
|
159
303
|
node_id=row["node_id"],
|
|
@@ -168,6 +312,77 @@ class RAGRetriever:
|
|
|
168
312
|
|
|
169
313
|
return sorted(results, key=lambda r: r.score, reverse=True)[:top_k]
|
|
170
314
|
|
|
315
|
+
# ------------------------------------------------------------------
|
|
316
|
+
# Graph-neighbour augmentation
|
|
317
|
+
# ------------------------------------------------------------------
|
|
318
|
+
|
|
319
|
+
def _augment_with_graph_neighbours(
|
|
320
|
+
self,
|
|
321
|
+
results: List[SearchResult],
|
|
322
|
+
query_emb: List[float],
|
|
323
|
+
max_total: int,
|
|
324
|
+
) -> List[SearchResult]:
|
|
325
|
+
"""Expand the result set with direct dependency neighbours.
|
|
326
|
+
|
|
327
|
+
For the top-3 semantic hits, fetch their outgoing and incoming
|
|
328
|
+
graph edges and score the neighbour nodes against the query.
|
|
329
|
+
Merge into *results* (deduplicated by ``node_id``).
|
|
330
|
+
"""
|
|
331
|
+
seen_ids: Set[str] = {r.node_id for r in results}
|
|
332
|
+
extra: List[SearchResult] = []
|
|
333
|
+
|
|
334
|
+
# Only augment from the best 3 hits to keep it fast
|
|
335
|
+
for sr in results[:3]:
|
|
336
|
+
for edge in self.store.neighbors(sr.node_id):
|
|
337
|
+
dst_id = edge["dst"]
|
|
338
|
+
if dst_id in seen_ids:
|
|
339
|
+
continue
|
|
340
|
+
seen_ids.add(dst_id)
|
|
341
|
+
node_row = self.store.get_node(dst_id)
|
|
342
|
+
if node_row is None:
|
|
343
|
+
continue
|
|
344
|
+
emb = json.loads(node_row["embedding"] or "[]")
|
|
345
|
+
if emb:
|
|
346
|
+
score = cosine_similarity(query_emb, emb)
|
|
347
|
+
else:
|
|
348
|
+
score = sr.score * 0.3
|
|
349
|
+
extra.append(SearchResult(
|
|
350
|
+
node_id=node_row["node_id"],
|
|
351
|
+
score=score,
|
|
352
|
+
node_type=node_row["node_type"],
|
|
353
|
+
qualname=node_row["qualname"],
|
|
354
|
+
file_path=node_row["file_path"],
|
|
355
|
+
start_line=node_row["start_line"],
|
|
356
|
+
end_line=node_row["end_line"],
|
|
357
|
+
snippet=node_row["code"],
|
|
358
|
+
))
|
|
359
|
+
|
|
360
|
+
for edge in self.store.reverse_neighbors(sr.node_id):
|
|
361
|
+
src_id = edge["src"]
|
|
362
|
+
if src_id in seen_ids:
|
|
363
|
+
continue
|
|
364
|
+
seen_ids.add(src_id)
|
|
365
|
+
node_row = self.store.get_node(src_id)
|
|
366
|
+
if node_row is None:
|
|
367
|
+
continue
|
|
368
|
+
emb = json.loads(node_row["embedding"] or "[]")
|
|
369
|
+
if emb:
|
|
370
|
+
score = cosine_similarity(query_emb, emb)
|
|
371
|
+
else:
|
|
372
|
+
score = sr.score * 0.3
|
|
373
|
+
extra.append(SearchResult(
|
|
374
|
+
node_id=node_row["node_id"],
|
|
375
|
+
score=score,
|
|
376
|
+
node_type=node_row["node_type"],
|
|
377
|
+
qualname=node_row["qualname"],
|
|
378
|
+
file_path=node_row["file_path"],
|
|
379
|
+
start_line=node_row["start_line"],
|
|
380
|
+
end_line=node_row["end_line"],
|
|
381
|
+
snippet=node_row["code"],
|
|
382
|
+
))
|
|
383
|
+
|
|
384
|
+
return results + extra
|
|
385
|
+
|
|
171
386
|
# ------------------------------------------------------------------
|
|
172
387
|
# Convenience
|
|
173
388
|
# ------------------------------------------------------------------
|
|
@@ -182,6 +397,8 @@ class RAGRetriever:
|
|
|
182
397
|
"""Return a formatted string of the top search results.
|
|
183
398
|
|
|
184
399
|
Useful for injecting relevant code context into LLM prompts.
|
|
400
|
+
Applies context compression: strips imports, trims long code,
|
|
401
|
+
and formats structured metadata.
|
|
185
402
|
"""
|
|
186
403
|
matches = self.search(
|
|
187
404
|
query, top_k=top_k, node_type=node_type, file_filter=file_filter,
|
|
@@ -191,10 +408,56 @@ class RAGRetriever:
|
|
|
191
408
|
|
|
192
409
|
blocks: List[str] = []
|
|
193
410
|
for item in matches:
|
|
411
|
+
snippet = _compress_snippet(item.snippet)
|
|
194
412
|
blocks.append(
|
|
195
|
-
f"[{item.node_type}] {item.qualname}
|
|
196
|
-
f"
|
|
197
|
-
f"
|
|
198
|
-
f"```python\n{
|
|
413
|
+
f"[{item.node_type}] {item.qualname}\n"
|
|
414
|
+
f"file: {item.file_path}:{item.start_line}\n"
|
|
415
|
+
f"score: {item.score:.3f}\n"
|
|
416
|
+
f"```python\n{snippet}\n```"
|
|
199
417
|
)
|
|
200
418
|
return "\n\n".join(blocks)
|
|
419
|
+
|
|
420
|
+
# ------------------------------------------------------------------
|
|
421
|
+
# Debug helper
|
|
422
|
+
# ------------------------------------------------------------------
|
|
423
|
+
|
|
424
|
+
def debug_search(
|
|
425
|
+
self,
|
|
426
|
+
query: str,
|
|
427
|
+
top_k: int = 10,
|
|
428
|
+
) -> List[Dict[str, Any]]:
|
|
429
|
+
"""Diagnostic search — returns raw dicts with full scoring info.
|
|
430
|
+
|
|
431
|
+
Intended for ``cg debug-rag`` CLI command.
|
|
432
|
+
"""
|
|
433
|
+
results = self.search(query, top_k=top_k)
|
|
434
|
+
out: List[Dict[str, Any]] = []
|
|
435
|
+
for r in results:
|
|
436
|
+
out.append({
|
|
437
|
+
"node_id": r.node_id,
|
|
438
|
+
"qualname": r.qualname,
|
|
439
|
+
"node_type": r.node_type,
|
|
440
|
+
"file_path": r.file_path,
|
|
441
|
+
"score": round(r.score, 5),
|
|
442
|
+
"lines": f"{r.start_line}-{r.end_line}",
|
|
443
|
+
"snippet_len": len(r.snippet),
|
|
444
|
+
})
|
|
445
|
+
return out
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
# ===================================================================
|
|
449
|
+
# Context compression utilities
|
|
450
|
+
# ===================================================================
|
|
451
|
+
|
|
452
|
+
def _compress_snippet(code: str, max_chars: int = _MAX_SNIPPET_CHARS) -> str:
|
|
453
|
+
"""Clean and truncate a code snippet for LLM context.
|
|
454
|
+
|
|
455
|
+
1. Strip bare import lines (the LLM rarely needs them).
|
|
456
|
+
2. Collapse runs of blank lines.
|
|
457
|
+
3. Truncate to *max_chars*.
|
|
458
|
+
"""
|
|
459
|
+
code = _IMPORT_RE.sub("", code)
|
|
460
|
+
code = re.sub(r"\n{3,}", "\n\n", code).strip()
|
|
461
|
+
if len(code) > max_chars:
|
|
462
|
+
code = code[:max_chars] + "\n# ... (truncated)"
|
|
463
|
+
return code
|