luckyd_code-1.2.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- luckyd_code/__init__.py +54 -0
- luckyd_code/__main__.py +5 -0
- luckyd_code/_agent_loop.py +551 -0
- luckyd_code/_data_dir.py +73 -0
- luckyd_code/agent.py +38 -0
- luckyd_code/analytics/__init__.py +18 -0
- luckyd_code/analytics/reporter.py +195 -0
- luckyd_code/analytics/scanner.py +443 -0
- luckyd_code/analytics/smells.py +316 -0
- luckyd_code/analytics/trends.py +303 -0
- luckyd_code/api.py +473 -0
- luckyd_code/audit_daemon.py +845 -0
- luckyd_code/autonomous_fixer.py +473 -0
- luckyd_code/background.py +159 -0
- luckyd_code/backup.py +237 -0
- luckyd_code/brain/__init__.py +84 -0
- luckyd_code/brain/assembler.py +100 -0
- luckyd_code/brain/chunker.py +345 -0
- luckyd_code/brain/constants.py +73 -0
- luckyd_code/brain/embedder.py +163 -0
- luckyd_code/brain/graph.py +311 -0
- luckyd_code/brain/indexer.py +316 -0
- luckyd_code/brain/parser.py +140 -0
- luckyd_code/brain/retriever.py +234 -0
- luckyd_code/cli.py +894 -0
- luckyd_code/cli_commands/__init__.py +1 -0
- luckyd_code/cli_commands/audit.py +120 -0
- luckyd_code/cli_commands/background.py +83 -0
- luckyd_code/cli_commands/brain.py +87 -0
- luckyd_code/cli_commands/config.py +75 -0
- luckyd_code/cli_commands/dispatcher.py +695 -0
- luckyd_code/cli_commands/sessions.py +41 -0
- luckyd_code/cli_entry.py +147 -0
- luckyd_code/cli_utils.py +112 -0
- luckyd_code/config.py +205 -0
- luckyd_code/context.py +214 -0
- luckyd_code/cost_tracker.py +209 -0
- luckyd_code/error_reporter.py +508 -0
- luckyd_code/exceptions.py +39 -0
- luckyd_code/export.py +126 -0
- luckyd_code/feedback_analyzer.py +290 -0
- luckyd_code/file_watcher.py +258 -0
- luckyd_code/git/__init__.py +11 -0
- luckyd_code/git/auto_commit.py +157 -0
- luckyd_code/git/tools.py +85 -0
- luckyd_code/hooks.py +236 -0
- luckyd_code/indexer.py +280 -0
- luckyd_code/init.py +39 -0
- luckyd_code/keybindings.py +77 -0
- luckyd_code/log.py +55 -0
- luckyd_code/mcp/__init__.py +6 -0
- luckyd_code/mcp/client.py +184 -0
- luckyd_code/memory/__init__.py +19 -0
- luckyd_code/memory/manager.py +339 -0
- luckyd_code/metrics/__init__.py +5 -0
- luckyd_code/model_registry.py +131 -0
- luckyd_code/orchestrator.py +204 -0
- luckyd_code/permissions/__init__.py +1 -0
- luckyd_code/permissions/manager.py +103 -0
- luckyd_code/planner.py +361 -0
- luckyd_code/plugins.py +91 -0
- luckyd_code/py.typed +0 -0
- luckyd_code/retry.py +57 -0
- luckyd_code/router.py +417 -0
- luckyd_code/sandbox.py +156 -0
- luckyd_code/self_critique.py +2 -0
- luckyd_code/self_improve.py +274 -0
- luckyd_code/sessions.py +114 -0
- luckyd_code/settings.py +72 -0
- luckyd_code/skills/__init__.py +8 -0
- luckyd_code/skills/review.py +22 -0
- luckyd_code/skills/security.py +17 -0
- luckyd_code/tasks/__init__.py +1 -0
- luckyd_code/tasks/manager.py +102 -0
- luckyd_code/templates/icon-192.png +0 -0
- luckyd_code/templates/icon-512.png +0 -0
- luckyd_code/templates/index.html +1965 -0
- luckyd_code/templates/manifest.json +14 -0
- luckyd_code/templates/src/app.js +694 -0
- luckyd_code/templates/src/body.html +767 -0
- luckyd_code/templates/src/cdn.txt +2 -0
- luckyd_code/templates/src/style.css +474 -0
- luckyd_code/templates/sw.js +31 -0
- luckyd_code/templates/test.html +6 -0
- luckyd_code/themes.py +48 -0
- luckyd_code/tools/__init__.py +97 -0
- luckyd_code/tools/agent_tools.py +65 -0
- luckyd_code/tools/bash.py +360 -0
- luckyd_code/tools/brain_tools.py +137 -0
- luckyd_code/tools/browser.py +369 -0
- luckyd_code/tools/datetime_tool.py +34 -0
- luckyd_code/tools/dockerfile_gen.py +212 -0
- luckyd_code/tools/file_ops.py +381 -0
- luckyd_code/tools/game_gen.py +360 -0
- luckyd_code/tools/git_tools.py +130 -0
- luckyd_code/tools/git_worktree.py +63 -0
- luckyd_code/tools/path_validate.py +64 -0
- luckyd_code/tools/project_gen.py +187 -0
- luckyd_code/tools/readme_gen.py +227 -0
- luckyd_code/tools/registry.py +157 -0
- luckyd_code/tools/shell_detect.py +109 -0
- luckyd_code/tools/web.py +89 -0
- luckyd_code/tools/youtube.py +187 -0
- luckyd_code/tools_bridge.py +144 -0
- luckyd_code/undo.py +126 -0
- luckyd_code/update.py +60 -0
- luckyd_code/verify.py +360 -0
- luckyd_code/web_app.py +176 -0
- luckyd_code/web_routes/__init__.py +23 -0
- luckyd_code/web_routes/background.py +73 -0
- luckyd_code/web_routes/brain.py +109 -0
- luckyd_code/web_routes/cost.py +12 -0
- luckyd_code/web_routes/files.py +133 -0
- luckyd_code/web_routes/memories.py +94 -0
- luckyd_code/web_routes/misc.py +67 -0
- luckyd_code/web_routes/project.py +48 -0
- luckyd_code/web_routes/review.py +20 -0
- luckyd_code/web_routes/sessions.py +44 -0
- luckyd_code/web_routes/settings.py +43 -0
- luckyd_code/web_routes/static.py +70 -0
- luckyd_code/web_routes/update.py +19 -0
- luckyd_code/web_routes/ws.py +237 -0
- luckyd_code-1.2.2.dist-info/METADATA +297 -0
- luckyd_code-1.2.2.dist-info/RECORD +127 -0
- luckyd_code-1.2.2.dist-info/WHEEL +4 -0
- luckyd_code-1.2.2.dist-info/entry_points.txt +3 -0
- luckyd_code-1.2.2.dist-info/licenses/LICENSE +21 -0
luckyd_code/brain/graph.py
@@ -0,0 +1,311 @@
+"""Knowledge graph — stores and queries codebase structure across sessions."""
+
+import json
+import time
+from pathlib import Path
+from typing import Any
+
+from ..log import get_logger
+from .constants import BRAIN_DIR
+
+
+GRAPH_FILE = BRAIN_DIR / "graph.json"
+
+Node = dict[str, Any]
+Edge = dict[str, str]
+
+
+class KnowledgeGraph:
+    """Persistent knowledge graph of codebase structure.
+
+    Nodes: modules, classes, functions
+    Edges: imports, contains, calls, inherits
+    """
+
+    def __init__(self) -> None:
+        self.nodes: dict[str, Node] = {}
+        self.edges: list[Edge] = []
+        self.stats: dict[str, Any] = {
+            "node_count": 0,
+            "edge_count": 0,
+            "last_built": 0,
+            "files_parsed": 0,
+            "errors": 0,
+        }
+
+    def build(self, project_root: str, parsed_files: list[dict[str, Any]]) -> None:
+        self.nodes = {}
+        self.edges = []
+        self.stats["last_built"] = time.time()
+        self.stats["files_parsed"] = len(parsed_files)
+        self.stats["errors"] = 0
+
+        for pf in parsed_files:
+            if pf["errors"]:
+                self.stats["errors"] += len(pf["errors"])
+                continue
+
+            rel_path = pf["module"]
+            module_id = f"module:{rel_path}"
+
+            self.nodes[module_id] = {
+                "type": "module",
+                "name": Path(rel_path).name,
+                "file": rel_path,
+                "line": 1,
+                "doc": "",
+                "size": pf["size"],
+            }
+
+            for imp in pf["imports"]:
+                import_id = f"import:{imp['module']}:{imp['name']}"
+                if import_id not in self.nodes:
+                    self.nodes[import_id] = {
+                        "type": "import",
+                        "name": imp["name"],
+                        "module": imp["module"],
+                        "alias": imp.get("alias"),
+                        "file": rel_path,
+                        "line": 0,
+                        "doc": "",
+                    }
+                self.edges.append({"from": module_id, "to": import_id, "type": "imports"})
+
+            for cls in pf["classes"]:
+                cls_id = f"class:{rel_path}:{cls['name']}"
+                self.nodes[cls_id] = {
+                    "type": "class",
+                    "name": cls["name"],
+                    "file": rel_path,
+                    "line": cls["line"],
+                    "end_line": cls["end_line"],
+                    "bases": cls["base_names"],
+                    "decorators": cls["decorators"],
+                    "doc": cls["docstring"][:200],
+                }
+                self.edges.append({"from": module_id, "to": cls_id, "type": "contains"})
+
+                for base in cls["base_names"]:
+                    if base and base != "object":
+                        self.edges.append({
+                            "from": cls_id, "to": f"class:??:{base}", "type": "inherits"
+                        })
+
+                for method in cls["methods"]:
+                    method_id = f"method:{rel_path}:{cls['name']}.{method['name']}"
+                    self.nodes[method_id] = {
+                        "type": "method",
+                        "name": method["name"],
+                        "class": cls["name"],
+                        "file": rel_path,
+                        "line": method["line"],
+                        "end_line": method["end_line"],
+                        "decorators": method["decorators"],
+                        "doc": method["docstring"][:200],
+                    }
+                    self.edges.append({"from": cls_id, "to": method_id, "type": "contains"})
+                    for call in method["calls"]:
+                        self.edges.append({"from": method_id, "to": f"func:??:{call}", "type": "calls"})
+
+            for func in pf["functions"]:
+                func_id = f"func:{rel_path}:{func['name']}"
+                self.nodes[func_id] = {
+                    "type": "function",
+                    "name": func["name"],
+                    "file": rel_path,
+                    "line": func["line"],
+                    "end_line": func["end_line"],
+                    "decorators": func["decorators"],
+                    "doc": func["docstring"][:200],
+                }
+                self.edges.append({"from": module_id, "to": func_id, "type": "contains"})
+                for call in func["calls"]:
+                    self.edges.append({"from": func_id, "to": f"func:??:{call}", "type": "calls"})
+
+        self.stats["node_count"] = len(self.nodes)
+        self.stats["edge_count"] = len(self.edges)
+
+    # --- Persistence ---
+
+    def save(self) -> None:
+        BRAIN_DIR.mkdir(parents=True, exist_ok=True)
+        data = {"nodes": self.nodes, "edges": self.edges, "stats": self.stats}
+        GRAPH_FILE.write_text(json.dumps(data, indent=2), encoding="utf-8")
+
+    def load(self) -> bool:
+        if GRAPH_FILE.exists():
+            try:
+                data: Any = json.loads(GRAPH_FILE.read_text(encoding="utf-8"))
+                self.nodes = data.get("nodes", {})
+                # Edges are always saved as a list; discard malformed shapes.
+                self.edges = data.get("edges", []) if isinstance(data.get("edges"), list) else []
+                self.stats = data.get("stats", {})
+                return True
+            except (json.JSONDecodeError, OSError):
+                get_logger().warning("Could not load knowledge graph from %s", GRAPH_FILE, exc_info=True)
+        return False
+
+    def search(self, query: str, max_results: int = 15) -> list[Node]:
+        q = query.lower()
+        scored: list[tuple[int, str, Node]] = []
+
+        for nid, node in self.nodes.items():
+            score = 0
+            if q in node.get("name", "").lower():
+                score += 10
+            if q in node.get("file", "").lower():
+                score += 5
+            if q in node.get("doc", "").lower():
+                score += 3
+            if q in node.get("module", "").lower():
+                score += 2
+            if q in node.get("class", "").lower():
+                score += 2
+            if score > 0:
+                scored.append((score, nid, node))
+
+        scored.sort(key=lambda x: -x[0])
+        seen: set[str] = set()
+        top: list[Node] = []
+        for _score, nid, node in scored[:max_results]:
+            if nid not in seen:
+                top.append(node)
+                seen.add(nid)
+        return top
+
+    def get_related(self, node_id: str, max_depth: int = 1) -> list[Node]:
+        related: set[str] = set()
+        current = {node_id}
+
+        for _ in range(max_depth):
+            next_set: set[str] = set()
+            for edge in self.edges:
+                if isinstance(edge, dict):
+                    if edge["from"] in current:
+                        next_set.add(edge["to"])
+                    if edge["to"] in current:
+                        next_set.add(edge["from"])
+            current = next_set
+            related.update(current)
+
+        return [
+            self.nodes.get(nid, {"name": nid, "type": "unknown", "file": ""})
+            for nid in related if nid != node_id
+        ]
+
+    def get_by_file(self, filepath: str) -> list[Node]:
+        return [
+            node for node in self.nodes.values()
+            if node.get("file", "").endswith(filepath)
+        ]
+
+    def get_by_type(self, node_type: str) -> list[Node]:
+        return [
+            node for node in self.nodes.values()
+            if node.get("type") == node_type
+        ]
+
+    def find_dependents(self, symbol_name: str, max_results: int = 15) -> list[dict]:
+        """Find all nodes that depend on a symbol by traversing incoming edges."""
+        matches = self.search(symbol_name, max_results=5)
+        if not matches:
+            return []
+
+        # Build a lookup of known node IDs so we can match even with incomplete IDs
+        node_ids = set(self.nodes.keys())
+        dependents: list[dict] = []
+        seen: set[str] = set()
+
+        for match in matches:
+            # Generate the most likely node IDs for this match. Node IDs
+            # abbreviate "function" to "func", and method IDs are qualified
+            # with the class name, so account for both forms here.
+            ntype = "func" if match["type"] == "function" else match["type"]
+            candidate_ids = [
+                f"{ntype}:{match.get('file', '')}:{match['name']}",
+                f"method:{match.get('file', '')}:{match.get('class', '')}.{match['name']}",
+                f"func:??:{match['name']}",
+                f"method:??:{match['name']}",
+                f"class:??:{match['name']}",
+            ]
+            matched_id = None
+            for cid in candidate_ids:
+                if cid in node_ids:
+                    matched_id = cid
+                    break
+
+            if not matched_id:
+                continue
+
+            for edge in self.edges:
+                if isinstance(edge, dict) and edge.get("to") == matched_id:
+                    src = self.nodes.get(edge["from"])
+                    if src and edge["from"] not in seen:
+                        seen.add(edge["from"])
+                        dependents.append({
+                            "name": f"{src.get('type', '?')}:{src.get('name', '?')}",
+                            "file": src.get("file", ""),
+                            "relation": edge.get("type", ""),
+                            "line": src.get("line", 0),
+                        })
+
+        dependents.sort(key=lambda x: (x["file"], x["line"]))
+        return dependents[:max_results]
+
+    def summarize(self, max_modules: int = 20) -> str:
+        lines = ["<knowledge-graph>"]
+        lines.append(f"Graph: {self.stats.get('node_count', 0)} symbols, "
+                     f"{self.stats.get('edge_count', 0)} relationships, "
+                     f"{self.stats.get('files_parsed', 0)} files")
+
+        by_file: dict[str, list[Node]] = {}
+        for node in self.nodes.values():
+            f = node.get("file", "")
+            if f:
+                by_file.setdefault(f, []).append(node)
+
+        count = 0
+        for filepath, nodes in sorted(by_file.items()):
+            if count >= max_modules:
+                break
+            count += 1
+            classes = [n for n in nodes if n["type"] == "class"]
+            functions = [n for n in nodes if n["type"] == "function"]
+
+            short_path = Path(filepath).name
+            parts = [short_path]
+            if classes:
+                parts.append(f"classes={{{','.join(c['name'] for c in classes)}}}")
+            if functions:
+                parts.append(f"functions={{{','.join(f['name'] for f in functions)}}}")
+            lines.append(f"  {' | '.join(parts)}")
+
+        lines.append("</knowledge-graph>")
+        return "\n".join(lines)
+
+    def stats_text(self) -> str:
+        by_type: dict[str, int] = {}
+        for node in self.nodes.values():
+            t = node.get("type", "unknown")
+            by_type[t] = by_type.get(t, 0) + 1
+
+        by_file: dict[str, int] = {}
+        for node in self.nodes.values():
+            f = node.get("file", "")
+            if f:
+                by_file[f] = by_file.get(f, 0) + 1
+
+        lines = [
+            f"Nodes: {self.stats.get('node_count', 0)}",
+            f"Edges: {self.stats.get('edge_count', 0)}",
+            f"Files parsed: {self.stats.get('files_parsed', 0)}",
+            f"Parse errors: {self.stats.get('errors', 0)}",
+        ]
+        if by_type:
+            lines.append("\nBy type:")
+            for t, c in sorted(by_type.items(), key=lambda x: -x[1]):
+                lines.append(f"  {t}: {c}")
+        if by_file:
+            lines.append("\nBy file:")
+            for f, c in sorted(by_file.items(), key=lambda x: -x[1])[:20]:
+                lines.append(f"  {f}: {c} symbols")
+        if self.stats.get("last_built"):
+            last = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(self.stats["last_built"]))
+            lines.append(f"\nLast built: {last}")
+
+        return "\n".join(lines)
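
For orientation, here is a minimal sketch of driving KnowledgeGraph by hand. The parsed-file record below is hand-written to match the keys build() reads ("module", "size", "errors", "imports", "classes", "functions"); in the package these records come from the brain parser, so treat the literal values as illustrative assumptions, not actual parser output.

```python
# Illustrative only: a hand-built parsed-file record shaped like what
# KnowledgeGraph.build() reads above (normally produced by the brain parser).
from luckyd_code.brain.graph import KnowledgeGraph

parsed = [{
    "module": "pkg/util.py",
    "size": 120,
    "errors": [],
    "imports": [{"module": "json", "name": "json", "alias": None}],
    "classes": [],
    "functions": [{
        "name": "helper", "line": 10, "end_line": 20,
        "decorators": [], "docstring": "Compute the answer.",
        "calls": ["json.dumps"],
    }],
}]

graph = KnowledgeGraph()
graph.build(".", parsed)
graph.save()                            # writes BRAIN_DIR / "graph.json"

print(graph.search("helper"))           # name hits score highest (+10)
print(graph.find_dependents("helper"))  # -> the containing module node
```

Note how build() assigns every symbol an ID such as func:pkg/util.py:helper; find_dependents() reconstructs those IDs from search hits and then walks incoming edges to report what contains, calls, or inherits from the symbol.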
luckyd_code/brain/indexer.py
@@ -0,0 +1,316 @@
+"""Vector indexer — builds and queries FAISS vector index for code chunks."""
+
+import json
+import os
+import time
+from pathlib import Path
+from typing import Any
+
+from ..log import get_logger
+from .constants import BRAIN_DIR, LANGUAGE_MAP, SKIP_DIRS
+
+INDEX_FILE = BRAIN_DIR / "index.faiss"
+CHUNKS_FILE = BRAIN_DIR / "chunks.json"
+MTIMES_FILE = BRAIN_DIR / "mtimes.json"
+STATS_FILE = BRAIN_DIR / "stats.json"
+
+CHUNK_SIZE = 384  # all-MiniLM-L6-v2 dimension
+
+
+class VectorIndexer:
+    """Manages the FAISS vector index with mtime tracking."""
+
+    def __init__(self):
+        self.index: Any = None  # FAISS index object
+        self.chunks: list[dict[str, Any]] = []
+        self.file_mtimes: dict[str, tuple[float, int]] = {}
+        self.stats: dict[str, Any] = {
+            "chunks": 0,
+            "files": 0,
+            "languages": {},
+            "last_indexed": 0,
+            "dimension": 0,
+            "index_size_bytes": 0,
+        }
+        self._faiss_available = False
+
+    def _check_deps(self) -> bool:
+        """Check if FAISS and numpy are available."""
+        if not self._faiss_available:
+            try:
+                import faiss
+                import numpy as np
+
+                self._faiss = faiss
+                self._np = np
+                self._faiss_available = True
+                return True
+            except ImportError:
+                get_logger().info(
+                    "faiss-cpu not available. Vector search disabled. "
+                    "Install with: pip install faiss-cpu"
+                )
+                return False
+        return True
+
+    def build(self, chunks: list[dict[str, Any]]) -> dict[str, Any]:
+        """Build the FAISS index from chunks.
+
+        Args:
+            chunks: List of chunk dicts from chunker.
+
+        Returns:
+            Stats dict.
+        """
+        from .embedder import get_embedder
+
+        if not self._check_deps():
+            self.stats["chunks"] = len(chunks)
+            self.stats["files"] = len(set(c["file_path"] for c in chunks))
+            self.stats["last_indexed"] = time.time()
+            return self.stats
+
+        embedder = get_embedder()
+        if not embedder.available:
+            self.stats["chunks"] = len(chunks)
+            self.stats["files"] = len(set(c["file_path"] for c in chunks))
+            self.stats["last_indexed"] = time.time()
+            return self.stats
+
+        if not chunks:
+            self.chunks = []
+            self.index = None
+            self.stats["chunks"] = 0
+            self.stats["files"] = 0
+            self.stats["languages"] = {}
+            self.stats["last_indexed"] = time.time()
+            return self.stats
+
+        # Track languages
+        languages: dict[str, int] = {}
+        for c in chunks:
+            lang = c.get("language", "unknown")
+            languages[lang] = languages.get(lang, 0) + 1
+
+        # Sort chunks by file_path then start_line for stable ordering
+        chunks.sort(key=lambda c: (c["file_path"], c.get("start_line", 0)))
+        self.chunks = chunks
+
+        # Embed all chunk contents
+        texts = [c.get("content", "") for c in chunks]
+        embeddings = embedder.embed(texts)
+
+        if embeddings is None:
+            self.stats["chunks"] = len(chunks)
+            self.stats["files"] = len(set(c["file_path"] for c in chunks))
+            self.stats["last_indexed"] = time.time()
+            return self.stats
+
+        # Build FAISS index
+        dim = len(embeddings[0])
+        idx = self._faiss.IndexFlatIP(dim)  # Inner product = cosine sim for normalized vectors
+        vectors = self._np.array(embeddings, dtype=self._np.float32)
+
+        # Normalize vectors for cosine similarity
+        self._faiss.normalize_L2(vectors)
+        idx.add(vectors)
+
+        self.index = idx
+        self.stats = {
+            "chunks": len(chunks),
+            "files": len(set(c["file_path"] for c in chunks)),
+            "languages": languages,
+            "last_indexed": time.time(),
+            "dimension": dim,
+            "index_size_bytes": 0,
+        }
+
+        return self.stats
+
+    def search(
+        self, query: str, k: int = 10
+    ) -> list[dict[str, Any]]:
+        """Search the index by embedding the query.
+
+        Args:
+            query: Natural language search query.
+            k: Number of results to return.
+
+        Returns:
+            List of chunk dicts with a 'score' key added.
+        """
+        from .embedder import get_embedder
+
+        if not self._check_deps() or self.index is None or self.index.ntotal == 0:
+            return []
+
+        embedder = get_embedder()
+        if not embedder.available:
+            return []
+
+        query_vec = embedder.embed_query(query)
+        if query_vec is None:
+            return []
+
+        # Normalize query vector
+        q = self._np.array([query_vec], dtype=self._np.float32)
+        self._faiss.normalize_L2(q)
+
+        k_actual = min(k, self.index.ntotal)
+        if k_actual == 0:
+            return []
+
+        scores, indices = self.index.search(q, k_actual)
+
+        results = []
+        for score, idx in zip(scores[0], indices[0]):
+            if idx < 0 or idx >= len(self.chunks):
+                continue
+            chunk = dict(self.chunks[idx])
+            chunk["score"] = float(score)
+            results.append(chunk)
+
+        return results
+
+    def save(self) -> bool:
+        """Save the index, chunks, and mtimes to disk."""
+        BRAIN_DIR.mkdir(parents=True, exist_ok=True)
+
+        try:
+            # Save FAISS index
+            if self._faiss_available and self.index is not None:
+                self._faiss.write_index(self.index, str(INDEX_FILE))
+                self.stats["index_size_bytes"] = INDEX_FILE.stat().st_size
+
+            # Save chunks with content
+            CHUNKS_FILE.write_text(
+                json.dumps(self.chunks, indent=2), encoding="utf-8"
+            )
+
+            # Save mtimes
+            MTIMES_FILE.write_text(
+                json.dumps(self.file_mtimes), encoding="utf-8"
+            )
+
+            # Save stats
+            STATS_FILE.write_text(
+                json.dumps(self.stats), encoding="utf-8"
+            )
+
+            return True
+        except Exception as exc:
+            get_logger().warning("Failed to save vector index: %s", exc)
+            return False
+
+    def load(self) -> bool:
+        """Load the index and metadata from disk.
+
+        Returns:
+            True if index was loaded successfully.
+        """
+        if not INDEX_FILE.exists() or not CHUNKS_FILE.exists():
+            return False
+
+        try:
+            self._check_deps()
+
+            # Load chunks
+            self.chunks = json.loads(CHUNKS_FILE.read_text(encoding="utf-8"))
+            if not self.chunks:
+                return False
+
+            # Load FAISS index
+            if self._faiss_available and INDEX_FILE.exists():
+                self.index = self._faiss.read_index(str(INDEX_FILE))
+
+            # Load mtimes
+            if MTIMES_FILE.exists():
+                self.file_mtimes = json.loads(MTIMES_FILE.read_text(encoding="utf-8")) or {}
+
+            # Load stats
+            if STATS_FILE.exists():
+                self.stats = json.loads(STATS_FILE.read_text(encoding="utf-8")) or {}
+
+            return True
+
+        except Exception as exc:
+            get_logger().warning("Failed to load vector index: %s", exc)
+            return False
+
+    def stats_text(self) -> str:
+        """Return human-readable statistics."""
+        lines = [
+            f"Chunks indexed: {self.stats.get('chunks', 0)}",
+            f"Files: {self.stats.get('files', 0)}",
+        ]
+
+        languages = self.stats.get("languages", {})
+        if languages:
+            lines.append(f"Languages: {', '.join(f'{k}={v}' for k, v in sorted(languages.items()))}")
+
+        dim = self.stats.get("dimension", 0)
+        if dim:
+            lines.append(f"Vector dimension: {dim}")
+
+        size = self.stats.get("index_size_bytes", 0)
+        if size:
+            if size < 1024:
+                size_str = f"{size} B"
+            elif size < 1024 * 1024:
+                size_str = f"{size / 1024:.1f} KB"
+            else:
+                size_str = f"{size / 1024 / 1024:.1f} MB"
+            lines.append(f"Index size: {size_str}")
+
+        last = self.stats.get("last_indexed", 0)
+        if last:
+            last_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(last))
+            lines.append(f"Last indexed: {last_str}")
+
+        if not self._faiss_available:
+            lines.append("FAISS not available (install faiss-cpu for vector search)")
+
+        return "\n".join(lines)
+
+    def get_changed_files(self, project_root: str) -> list[str]:
+        """Check which files have changed since last index.
+
+        Args:
+            project_root: Root directory to scan.
+
+        Returns:
+            List of file paths that have changed or are new.
+        """
+        changed: list[str] = []
+        root = Path(project_root).resolve()
+
+        for dirpath, dirnames, filenames in os.walk(root):
+            dirnames[:] = [d for d in dirnames if d not in SKIP_DIRS and not d.startswith(".")]
+
+            for fname in filenames:
+                suffix = Path(fname).suffix.lower()
+                if suffix not in LANGUAGE_MAP:
+                    continue
+
+                fpath = Path(dirpath) / fname
+                try:
+                    st = fpath.stat()
+                    mtime = st.st_mtime
+                    size = st.st_size
+                except OSError:
+                    continue
+
+                fpath_str = str(fpath)
+                if fpath_str in self.file_mtimes:
+                    old_mtime, old_size = self.file_mtimes[fpath_str]
+                    if old_mtime == mtime and old_size == size:
+                        continue
+
+                changed.append(fpath_str)
+
+        return changed
+
+    @property
+    def is_available(self) -> bool:
+        """Whether the index is loaded and ready."""
+        return self._faiss_available and self.index is not None and self.index.ntotal > 0
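
And a sketch of the indexer round-trip. The chunk keys ("file_path", "start_line", "language", "content") are inferred from what build() and search() read above; real chunks come from the brain chunker, and meaningful vector search requires faiss-cpu plus the embedder's model dependencies. Without them, build() degrades to recording stats only, as the fallback branches above show.

```python
# Illustrative only: hand-written chunks shaped like the chunker output
# that VectorIndexer.build() expects above.
from luckyd_code.brain.indexer import VectorIndexer

chunks = [
    {"file_path": "pkg/util.py", "start_line": 1, "language": "python",
     "content": "def helper():\n    return 42\n"},
    {"file_path": "pkg/cli.py", "start_line": 1, "language": "python",
     "content": "def main():\n    print(helper())\n"},
]

indexer = VectorIndexer()
indexer.build(chunks)   # stats-only if faiss/embedder are unavailable
indexer.save()          # index.faiss, chunks.json, mtimes.json, stats.json

fresh = VectorIndexer()
if fresh.load() and fresh.is_available:
    for hit in fresh.search("function that returns 42", k=2):
        print(f"{hit['file_path']}:{hit['start_line']}  score={hit['score']:.3f}")
print(fresh.stats_text())
```

The IndexFlatIP choice above pairs with normalize_L2 on both sides, so the reported score is cosine similarity; load() tolerates a missing FAISS install and falls back to chunk metadata only.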