codegraph_cli-2.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codegraph_cli/__init__.py +4 -0
- codegraph_cli/agents.py +191 -0
- codegraph_cli/bug_detector.py +386 -0
- codegraph_cli/chat_agent.py +352 -0
- codegraph_cli/chat_session.py +220 -0
- codegraph_cli/cli.py +330 -0
- codegraph_cli/cli_chat.py +367 -0
- codegraph_cli/cli_diagnose.py +133 -0
- codegraph_cli/cli_refactor.py +230 -0
- codegraph_cli/cli_setup.py +470 -0
- codegraph_cli/cli_test.py +177 -0
- codegraph_cli/cli_v2.py +267 -0
- codegraph_cli/codegen_agent.py +265 -0
- codegraph_cli/config.py +31 -0
- codegraph_cli/config_manager.py +341 -0
- codegraph_cli/context_manager.py +500 -0
- codegraph_cli/crew_agents.py +123 -0
- codegraph_cli/crew_chat.py +159 -0
- codegraph_cli/crew_tools.py +497 -0
- codegraph_cli/diff_engine.py +265 -0
- codegraph_cli/embeddings.py +241 -0
- codegraph_cli/graph_export.py +144 -0
- codegraph_cli/llm.py +642 -0
- codegraph_cli/models.py +47 -0
- codegraph_cli/models_v2.py +185 -0
- codegraph_cli/orchestrator.py +49 -0
- codegraph_cli/parser.py +800 -0
- codegraph_cli/performance_analyzer.py +223 -0
- codegraph_cli/project_context.py +230 -0
- codegraph_cli/rag.py +200 -0
- codegraph_cli/refactor_agent.py +452 -0
- codegraph_cli/security_scanner.py +366 -0
- codegraph_cli/storage.py +390 -0
- codegraph_cli/templates/graph_interactive.html +257 -0
- codegraph_cli/testgen_agent.py +316 -0
- codegraph_cli/validation_engine.py +285 -0
- codegraph_cli/vector_store.py +293 -0
- codegraph_cli-2.0.0.dist-info/METADATA +318 -0
- codegraph_cli-2.0.0.dist-info/RECORD +43 -0
- codegraph_cli-2.0.0.dist-info/WHEEL +5 -0
- codegraph_cli-2.0.0.dist-info/entry_points.txt +2 -0
- codegraph_cli-2.0.0.dist-info/licenses/LICENSE +21 -0
- codegraph_cli-2.0.0.dist-info/top_level.txt +1 -0
codegraph_cli/performance_analyzer.py
ADDED

@@ -0,0 +1,223 @@
"""Performance issue analyzer."""

from __future__ import annotations

import ast
from typing import Dict, List, Set

from .storage import GraphStore


class PerformanceAnalyzer:
    """Analyze code for performance issues."""

    def __init__(self, store: GraphStore):
        self.store = store

        # Database-related method names that indicate queries
        self.query_methods = {
            "execute", "executemany", "query", "get", "filter",
            "all", "first", "one", "fetch", "fetchone", "fetchall",
            "select", "insert", "update", "delete"
        }

    def analyze_file(self, file_path: str) -> List[Dict]:
        """Analyze file for performance issues.

        Args:
            file_path: Path to file to analyze

        Returns:
            List of performance issue dictionaries
        """
        issues = []

        nodes = [n for n in self.store.get_nodes() if n["file_path"] == file_path]
        # Skip module-level node when function/class nodes exist to avoid dupes
        has_children = any(n["node_type"] != "module" for n in nodes)
        if has_children:
            nodes = [n for n in nodes if n["node_type"] != "module"]

        seen: set = set()  # (line, type) dedup

        for node in nodes:
            try:
                tree = ast.parse(node["code"])
            except SyntaxError:
                continue

            for issue in (
                self._detect_n_plus_one(tree, node)
                + self._detect_inefficient_algorithms(tree, node)
                + self._detect_memory_issues(tree, node)
            ):
                key = (issue["line"], issue["type"])
                if key not in seen:
                    seen.add(key)
                    issues.append(issue)

        return issues

    def _detect_n_plus_one(self, tree: ast.AST, node: Dict) -> List[Dict]:
        """Detect N+1 query patterns."""
        issues = []

        for ast_node in ast.walk(tree):
            # Look for loops with database queries inside
            if isinstance(ast_node, (ast.For, ast.While)):
                # Check for query calls inside the loop
                for inner_node in ast.walk(ast_node):
                    if isinstance(inner_node, ast.Call):
                        # Check for query-like method names
                        if isinstance(inner_node.func, ast.Attribute):
                            if inner_node.func.attr in self.query_methods:
                                issues.append({
                                    "type": "n_plus_one_query",
                                    "severity": "high",
                                    "line": node["start_line"] + inner_node.lineno - 1,
                                    "message": "Potential N+1 query pattern: database query inside loop",
                                    "suggestion": "Use bulk queries, eager loading, or prefetch_related()",
                                    "code_snippet": ast.unparse(inner_node)[:100]
                                })
                                break  # Only report once per loop

        return issues

    def _detect_inefficient_algorithms(self, tree: ast.AST, node: Dict) -> List[Dict]:
        """Detect inefficient algorithm patterns."""
        issues = []

        # Track nested loops for O(n²) detection
        loop_depth = {}

        for ast_node in ast.walk(tree):
            if isinstance(ast_node, (ast.For, ast.While)):
                # Check for nested loops
                nested_loops = []
                for inner in ast.walk(ast_node):
                    if inner != ast_node and isinstance(inner, (ast.For, ast.While)):
                        nested_loops.append(inner)

                if nested_loops:
                    # Report the first nested loop found (ast.walk is breadth-first)
                    innermost = nested_loops[0]
                    issues.append({
                        "type": "nested_loop",
                        "severity": "medium",
                        "line": node["start_line"] + innermost.lineno - 1,
                        "message": "Nested loop detected (O(n²) complexity)",
                        "suggestion": "Consider using hash maps, sets, or optimizing algorithm",
                        "code_snippet": "for ... in ...: for ... in ..."
                    })

            # Detect list operations in loops (inefficient)
            if isinstance(ast_node, ast.For):
                for inner in ast.walk(ast_node):
                    # Check for list.append in loop with large iterations
                    if isinstance(inner, ast.Call):
                        if isinstance(inner.func, ast.Attribute):
                            # Detect repeated string concatenation
                            if inner.func.attr == "append":
                                # Check if it's appending to a list that's later joined
                                # This is actually efficient, so skip
                                pass

                            # Detect list.insert(0, ...) which is O(n)
                            elif inner.func.attr == "insert":
                                if inner.args and isinstance(inner.args[0], ast.Constant):
                                    if inner.args[0].value == 0:
                                        issues.append({
                                            "type": "inefficient_operation",
                                            "severity": "medium",
                                            "line": node["start_line"] + inner.lineno - 1,
                                            "message": "list.insert(0, ...) in loop is O(n²)",
                                            "suggestion": "Use collections.deque or append and reverse",
                                            "code_snippet": ast.unparse(inner)[:100]
                                        })

        # Detect string concatenation in loops
        for ast_node in ast.walk(tree):
            if isinstance(ast_node, (ast.For, ast.While)):
                for inner in ast.walk(ast_node):
                    # Look for += on strings
                    if isinstance(inner, ast.AugAssign):
                        if isinstance(inner.op, ast.Add):
                            # Check if target is likely a string
                            if isinstance(inner.target, ast.Name):
                                issues.append({
                                    "type": "string_concatenation_loop",
                                    "severity": "low",
                                    "line": node["start_line"] + inner.lineno - 1,
                                    "message": "String concatenation in loop (inefficient)",
                                    "suggestion": "Use list.append() and ''.join() instead",
                                    "code_snippet": ast.unparse(inner)[:100]
                                })

        return issues

    def _detect_memory_issues(self, tree: ast.AST, node: Dict) -> List[Dict]:
        """Detect memory inefficiencies."""
        issues = []
        seen_lines: set = set()

        for ast_node in ast.walk(tree):
            report_line = node["start_line"] + ast_node.lineno - 1 if hasattr(ast_node, 'lineno') else None

            # Detect large list comprehensions that could be generators
            if isinstance(ast_node, ast.ListComp):
                if report_line and report_line not in seen_lines:
                    seen_lines.add(report_line)
                    issues.append({
                        "type": "memory_inefficiency",
                        "severity": "low",
                        "line": report_line,
                        "message": "List comprehension could be a generator expression",
                        "suggestion": "Use (...) instead of [...] if you only iterate once",
                        "code_snippet": ast.unparse(ast_node)[:100]
                    })

            # Detect reading entire file into memory
            if isinstance(ast_node, ast.Call):
                if isinstance(ast_node.func, ast.Attribute):
                    # file.read() without size limit
                    if ast_node.func.attr == "read" and not ast_node.args:
                        issues.append({
                            "type": "memory_inefficiency",
                            "severity": "medium",
                            "line": node["start_line"] + ast_node.lineno - 1,
                            "message": "Reading entire file into memory",
                            "suggestion": "Read in chunks or iterate line by line",
                            "code_snippet": ast.unparse(ast_node)[:100]
                        })

                    # .readlines() also loads entire file
                    elif ast_node.func.attr == "readlines":
                        issues.append({
                            "type": "memory_inefficiency",
                            "severity": "medium",
                            "line": node["start_line"] + ast_node.lineno - 1,
                            "message": "readlines() loads entire file into memory",
                            "suggestion": "Iterate over file object directly: for line in file:",
                            "code_snippet": ast.unparse(ast_node)[:100]
                        })

        return issues

    def analyze_project(self) -> Dict[str, List[Dict]]:
        """Analyze entire project for performance issues.

        Returns:
            Dictionary mapping file paths to lists of performance issues
        """
        results = {}

        # Get all unique file paths
        all_nodes = self.store.get_nodes()
        file_paths = set(node["file_path"] for node in all_nodes)

        for file_path in file_paths:
            issues = self.analyze_file(file_path)
            if issues:
                results[file_path] = issues

        return results
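The two rewrites that performance_analyzer.py suggests most concretely (collections.deque instead of list.insert(0, ...) in a loop, and list-plus-join instead of += on strings) can be shown in a minimal standalone sketch. The slow_*/fast_* names below are illustrative and not part of the package:

"""Sketch of two rewrites suggested by PerformanceAnalyzer (illustrative)."""
from collections import deque

def slow_prepend(items):
    # Flagged as "inefficient_operation": each insert(0, ...) shifts
    # every existing element, so the loop is O(n^2) overall.
    out = []
    for item in items:
        out.insert(0, item)
    return out

def fast_prepend(items):
    # Suggested fix: deque.appendleft is O(1) per element.
    out = deque()
    for item in items:
        out.appendleft(item)
    return list(out)

def slow_join(lines):
    # Flagged as "string_concatenation_loop": += on a str rebuilds
    # the accumulator on every iteration.
    text = ""
    for line in lines:
        text += line
    return text

def fast_join(lines):
    # Suggested fix: accumulate once with str.join.
    return "".join(lines)

assert fast_prepend([1, 2, 3]) == slow_prepend([1, 2, 3]) == [3, 2, 1]
assert fast_join(["a", "b"]) == slow_join(["a", "b"]) == "ab"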
codegraph_cli/project_context.py
ADDED

@@ -0,0 +1,230 @@
"""Project context manager with real file system access."""

from __future__ import annotations

from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

from .storage import GraphStore, ProjectManager


class ProjectContext:
    """Unified context for project with real file access and code graph."""

    def __init__(self, project_name: str, project_manager: ProjectManager):
        """Initialize project context.

        Args:
            project_name: Name of the project
            project_manager: ProjectManager instance
        """
        self.project_name = project_name
        self.project_manager = project_manager
        self.project_dir = project_manager.project_dir(project_name)
        self.store = GraphStore(self.project_dir)
        self.metadata = self.store.get_metadata()

        # Get source path from metadata
        source_path_str = self.metadata.get("source_path")
        if source_path_str:
            self.source_path = Path(source_path_str)
            if not self.source_path.exists():
                # Source path moved or deleted
                self.source_path = None
        else:
            # Old project without source_path
            self.source_path = None

    @property
    def has_source_access(self) -> bool:
        """Check if we have access to the original source directory."""
        return self.source_path is not None and self.source_path.exists()

    # File System Operations

    def list_directory(self, rel_path: str = ".") -> List[Dict]:
        """List files and directories in project.

        Args:
            rel_path: Relative path from project root

        Returns:
            List of file/directory info dicts

        Raises:
            RuntimeError: If source path not available
        """
        if not self.has_source_access:
            raise RuntimeError(
                f"Source path not available for project '{self.project_name}'. "
                "Re-index the project to enable file access."
            )

        full_path = self.source_path / rel_path
        if not full_path.exists():
            return []

        if not full_path.is_dir():
            raise ValueError(f"Path is not a directory: {rel_path}")

        items = []
        for item in sorted(full_path.iterdir()):
            try:
                stat = item.stat()
                items.append({
                    "name": item.name,
                    "type": "dir" if item.is_dir() else "file",
                    "size": stat.st_size if item.is_file() else None,
                    "path": str(item.relative_to(self.source_path)),
                    "modified": datetime.fromtimestamp(stat.st_mtime).isoformat()
                })
            except (OSError, PermissionError):
                # Skip files we can't access
                continue

        return items

    def read_file(self, rel_path: str) -> str:
        """Read file contents from project.

        Args:
            rel_path: Relative path to file

        Returns:
            File contents as string

        Raises:
            RuntimeError: If source path not available
            FileNotFoundError: If file doesn't exist
        """
        if not self.has_source_access:
            raise RuntimeError(
                f"Source path not available for project '{self.project_name}'. "
                "Re-index the project to enable file access."
            )

        full_path = self.source_path / rel_path
        if not full_path.exists():
            raise FileNotFoundError(f"File not found: {rel_path}")

        if not full_path.is_file():
            raise ValueError(f"Path is not a file: {rel_path}")

        try:
            return full_path.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            # Not valid UTF-8; return a placeholder for binary files
            return f"[Binary file: {full_path.suffix}]"

    def write_file(self, rel_path: str, content: str, create_dirs: bool = True) -> bool:
        """Write or create a file in the project.

        Args:
            rel_path: Relative path to file
            content: File content
            create_dirs: Whether to create parent directories

        Returns:
            True if successful

        Raises:
            RuntimeError: If source path not available
        """
        if not self.has_source_access:
            raise RuntimeError(
                f"Source path not available for project '{self.project_name}'. "
                "Re-index the project to enable file access."
            )

        full_path = self.source_path / rel_path

        if create_dirs:
            full_path.parent.mkdir(parents=True, exist_ok=True)

        full_path.write_text(content, encoding="utf-8")
        return True

    def file_exists(self, rel_path: str) -> bool:
        """Check if a file exists in the project.

        Args:
            rel_path: Relative path to file

        Returns:
            True if file exists
        """
        if not self.has_source_access:
            return False

        return (self.source_path / rel_path).exists()

    def get_file_info(self, rel_path: str) -> Optional[Dict]:
        """Get information about a file.

        Args:
            rel_path: Relative path to file

        Returns:
            File info dict or None if not found
        """
        if not self.has_source_access:
            return None

        full_path = self.source_path / rel_path
        if not full_path.exists():
            return None

        try:
            stat = full_path.stat()
            return {
                "name": full_path.name,
                "path": rel_path,
                "type": "dir" if full_path.is_dir() else "file",
                "size": stat.st_size if full_path.is_file() else None,
                "modified": datetime.fromtimestamp(stat.st_mtime).isoformat(),
                "created": datetime.fromtimestamp(stat.st_ctime).isoformat()
            }
        except (OSError, PermissionError):
            return None

    # Code Graph Operations

    def get_indexed_files(self) -> List[str]:
        """Get list of indexed Python files.

        Returns:
            List of file paths that were indexed
        """
        nodes = self.store.get_nodes()
        return sorted(set(node["file_path"] for node in nodes))

    def get_project_summary(self) -> Dict:
        """Get summary of project.

        Returns:
            Dictionary with project statistics
        """
        nodes = self.store.get_nodes()
        edges = self.store.get_edges()

        # Count by type
        node_types = {}
        for node in nodes:
            node_type = node["node_type"]
            node_types[node_type] = node_types.get(node_type, 0) + 1

        return {
            "project_name": self.project_name,
            "source_path": str(self.source_path) if self.source_path else None,
            "has_source_access": self.has_source_access,
            "indexed_at": self.metadata.get("indexed_at"),
            "total_nodes": len(nodes),
            "total_edges": len(edges),
            "node_types": node_types,
            "indexed_files": len(self.get_indexed_files())
        }

    def close(self):
        """Close the graph store connection."""
        self.store.close()
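A minimal usage sketch for ProjectContext follows. The diff does not show how ProjectManager is constructed, so the no-argument call below is an assumption; every method used on ctx appears in the code above:

"""Hypothetical ProjectContext usage; ProjectManager() construction is assumed."""
from codegraph_cli.project_context import ProjectContext
from codegraph_cli.storage import ProjectManager

manager = ProjectManager()  # assumption: real constructor may take arguments
ctx = ProjectContext("my-project", manager)
try:
    if ctx.has_source_access:
        # Walk the project root and read a file if present
        for entry in ctx.list_directory("."):
            print(entry["type"], entry["path"], entry["size"])
        if ctx.file_exists("README.md"):
            print(ctx.read_file("README.md")[:200])
    print(ctx.get_project_summary())
finally:
    ctx.close()

Note that rel_path is joined to source_path without any sandbox check, so list_directory, read_file, and write_file trust callers not to pass paths that escape the project root.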
codegraph_cli/rag.py
ADDED

@@ -0,0 +1,200 @@
"""Retrieval-Augmented components for semantic code search.

Uses LanceDB hybrid search (vector + metadata filters) for fast,
accurate code retrieval. Falls back to brute-force cosine similarity
when the vector store is unavailable.
"""

from __future__ import annotations

import json
import logging
from typing import Any, Dict, List, Optional, Union

from .embeddings import HashEmbeddingModel, NeuralEmbedder, cosine_similarity
from .models import SearchResult
from .storage import GraphStore

logger = logging.getLogger(__name__)


class RAGRetriever:
    """Retrieve relevant code nodes from graph memory via semantic similarity.

    Supports two modes:

    1. **Vector store mode** (fast, preferred) – delegates to LanceDB via
       ``GraphStore.vector_store``.
    2. **Brute-force mode** (fallback) – scans all SQLite rows and computes
       cosine similarity in Python.

    The ``embedding_model`` argument accepts either a
    :class:`~codegraph_cli.embeddings.NeuralEmbedder` or the legacy
    :class:`~codegraph_cli.embeddings.HashEmbeddingModel`.
    """

    def __init__(
        self,
        store: GraphStore,
        embedding_model: Union[NeuralEmbedder, HashEmbeddingModel, Any],
    ) -> None:
        self.store = store
        self.embedding_model = embedding_model
        self.use_vector_store: bool = store.vector_store is not None

    # ------------------------------------------------------------------
    # Primary search
    # ------------------------------------------------------------------

    def search(
        self,
        query: str,
        top_k: int = 5,
        node_type: Optional[str] = None,
        file_filter: Optional[str] = None,
    ) -> List[SearchResult]:
        """Semantic search for code nodes.

        Args:
            query: Natural-language or code query.
            top_k: Number of results.
            node_type: Optional filter (``function``, ``class``, ``module``).
            file_filter: Optional file-path SQL pattern,
                e.g. ``"src/%"`` to restrict to files under *src/*.

        Returns:
            List of :class:`SearchResult` sorted by relevance (highest first).
        """
        query_emb: List[float] = self.embedding_model.embed_text(query)

        if self.use_vector_store:
            return self._search_vector_store(
                query_emb, top_k, node_type, file_filter,
            )
        return self._search_brute_force(query_emb, top_k, node_type)

    # ------------------------------------------------------------------
    # LanceDB path (fast)
    # ------------------------------------------------------------------

    def _search_vector_store(
        self,
        query_emb: List[float],
        top_k: int,
        node_type: Optional[str],
        file_filter: Optional[str],
    ) -> List[SearchResult]:
        assert self.store.vector_store is not None

        # Build SQL WHERE clause for hybrid search
        clauses: List[str] = []
        if node_type:
            clauses.append(f'node_type = "{node_type}"')
        if file_filter:
            clauses.append(f'file_path LIKE "{file_filter}"')
        where_sql = " AND ".join(clauses) if clauses else None

        raw_results = self.store.vector_store.hybrid_search(
            query_embedding=query_emb,
            n_results=top_k,
            where_sql=where_sql,
        )

        results: List[SearchResult] = []
        for row in raw_results:
            distance = row.get("_distance", 0.0)
            # LanceDB returns L2 distance by default; convert to a similarity
            # score in [0, 1]. For cosine distance the relationship is
            # score = 1 - distance (since embeddings are unit-normalised).
            score = max(0.0, 1.0 - distance)

            # Enrich from SQLite if full node data is needed
            node_row = self.store.get_node(row.get("id", ""))

            if node_row is not None:
                results.append(SearchResult(
                    node_id=node_row["node_id"],
                    score=score,
                    node_type=node_row["node_type"],
                    qualname=node_row["qualname"],
                    file_path=node_row["file_path"],
                    start_line=node_row["start_line"],
                    end_line=node_row["end_line"],
                    snippet=node_row["code"],
                ))
            else:
                # Use data straight from LanceDB
                results.append(SearchResult(
                    node_id=row.get("id", ""),
                    score=score,
                    node_type=row.get("node_type", ""),
                    qualname=row.get("qualname", ""),
                    file_path=row.get("file_path", ""),
                    start_line=0,
                    end_line=0,
                    snippet=row.get("document", ""),
                ))

        return results

    # ------------------------------------------------------------------
    # Brute-force fallback
    # ------------------------------------------------------------------

    def _search_brute_force(
        self,
        query_emb: List[float],
        top_k: int,
        node_type: Optional[str],
    ) -> List[SearchResult]:
        results: List[SearchResult] = []
        for row in self.store.get_nodes():
            if node_type and row["node_type"] != node_type:
                continue
            embedding = json.loads(row["embedding"] or "[]")
            score = cosine_similarity(query_emb, embedding)
            if score <= 0:
                continue
            results.append(SearchResult(
                node_id=row["node_id"],
                score=score,
                node_type=row["node_type"],
                qualname=row["qualname"],
                file_path=row["file_path"],
                start_line=row["start_line"],
                end_line=row["end_line"],
                snippet=row["code"],
            ))

        return sorted(results, key=lambda r: r.score, reverse=True)[:top_k]

    # ------------------------------------------------------------------
    # Convenience
    # ------------------------------------------------------------------

    def retrieve_context(
        self,
        query: str,
        top_k: int = 5,
        node_type: Optional[str] = None,
        file_filter: Optional[str] = None,
    ) -> str:
        """Return a formatted string of the top search results.

        Useful for injecting relevant code context into LLM prompts.
        """
        matches = self.search(
            query, top_k=top_k, node_type=node_type, file_filter=file_filter,
        )
        if not matches:
            return "No relevant nodes found."

        blocks: List[str] = []
        for item in matches:
            blocks.append(
                f"[{item.node_type}] {item.qualname} "
                f"({item.file_path}:{item.start_line})\n"
                f"Score: {item.score:.3f}\n"
                f"```python\n{item.snippet[:1200]}\n```"
            )
        return "\n\n".join(blocks)
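_search_vector_store converts a LanceDB distance to a score with score = 1 - distance, which its inline comment notes is exact only for cosine distance on unit-normalised embeddings. A self-contained numeric check of that relationship, and of the corresponding L2 identity, under the same unit-norm assumption (illustrative, not part of the package):

"""Check of the distance-to-score conversion for unit-normalised vectors."""
import math

def cosine(u, v):
    dot = sum(a * b for a, b in zip(u, v))
    nu = math.sqrt(sum(a * a for a in u))
    nv = math.sqrt(sum(b * b for b in v))
    return dot / (nu * nv)

# Two unit vectors 60 degrees apart.
u = [1.0, 0.0]
v = [0.5, math.sqrt(3) / 2]

cos_sim = cosine(u, v)           # 0.5
cosine_distance = 1.0 - cos_sim  # 0.5
l2_distance = math.dist(u, v)    # 1.0

# With a cosine metric, score = 1 - distance recovers the similarity exactly:
assert abs((1.0 - cosine_distance) - cos_sim) < 1e-9

# With L2 on unit vectors, ||u - v||^2 = 2 * (1 - cos), so 1 - distance is
# only a monotonic proxy for the similarity, not the similarity itself:
assert abs(l2_distance ** 2 - 2 * (1 - cos_sim)) < 1e-9

Because ||u - v|| decreases as the cosine grows, 1 - distance still ranks results in the same order under an L2 metric; only the absolute score values differ from true cosine similarity.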