dug-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dug/__init__.py +0 -0
- dug/__main__.py +297 -0
- dug/chunker.py +137 -0
- dug/config.py +77 -0
- dug/embeddings.py +97 -0
- dug/git_context.py +56 -0
- dug/graph.py +423 -0
- dug/history.py +231 -0
- dug/hooks.py +112 -0
- dug/indexer.py +294 -0
- dug/prompt_builder.py +106 -0
- dug/retriever.py +249 -0
- dug/vector_store.py +79 -0
- dug/verifier.py +73 -0
- dug/watcher.py +103 -0
- dug_cli-0.1.0.dist-info/METADATA +178 -0
- dug_cli-0.1.0.dist-info/RECORD +20 -0
- dug_cli-0.1.0.dist-info/WHEEL +4 -0
- dug_cli-0.1.0.dist-info/entry_points.txt +2 -0
- dug_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
dug/graph.py
ADDED
|
@@ -0,0 +1,423 @@
|
|
|
1
|
+
"""Structural knowledge graph — FILE, SYMBOL, and COMMIT nodes with edges."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
import subprocess
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
import networkx as nx
|
|
12
|
+
|
|
13
|
+
from .git_context import Commit, get_git_history
|
|
14
|
+
|
|
15
|
+
# ---------------------------------------------------------------------------
|
|
16
|
+
# Language helpers
|
|
17
|
+
# ---------------------------------------------------------------------------
|
|
18
|
+
|
|
19
|
+
# Supported language name -> file suffixes (leading dot included) that
# identify a source file of that language.
LANG_EXTENSIONS: dict[str, list[str]] = dict(
    python=[".py"],
    java=[".java"],
    typescript=[".ts", ".tsx"],
    javascript=[".js", ".jsx"],
)
|
|
25
|
+
|
|
26
|
+
# Supported language name -> regexes that flag a source line as an import
# statement. TypeScript and JavaScript deliberately get separate list objects.
IMPORT_PATTERNS: dict[str, list[str]] = dict(
    python=[r"^import\s+\S+", r"^from\s+\S+\s+import"],
    java=[r"^import\s+\S+"],
    typescript=[r"^import\s+", r"require\("],
    javascript=[r"^import\s+", r"require\("],
)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _ext_to_lang(ext: str) -> str | None:
    """Return the first language in LANG_EXTENSIONS that lists `ext`, else None.

    The comparison is an exact, case-sensitive match on the suffix string
    (including its leading dot).
    """
    return next(
        (name for name, suffixes in LANG_EXTENSIONS.items() if ext in suffixes),
        None,
    )
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
# Node dataclasses (stored as node attributes in networkx)
|
|
43
|
+
# ---------------------------------------------------------------------------
|
|
44
|
+
|
|
45
|
+
# Plain record mirroring the attributes stored on a FILE node
# (path, language, mtime, size).
@dataclass
class FileNode:
    # Path of the file relative to the repo root.
    path: str
    language: str
    last_modified: float
    size: int
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# Plain record mirroring the attributes stored on a SYMBOL node.
@dataclass
class SymbolNode:
    name: str
    # One of: function / class / method.
    kind: str
    file_path: str
    line_number: int
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# ---------------------------------------------------------------------------
|
|
62
|
+
# Graph builder
|
|
63
|
+
# ---------------------------------------------------------------------------
|
|
64
|
+
|
|
65
|
+
class CodeGraph:
    """Directed graph of FILE, SYMBOL, and COMMIT nodes backed by networkx.

    Node ids are prefixed strings: ``file:<rel path>``,
    ``sym:<rel path>:<name>:<line>`` and ``commit:<hash>``. Every node carries
    a ``kind`` attribute (``FILE`` / ``SYMBOL`` / ``COMMIT``) and every edge a
    ``rel`` attribute (``contains``, ``imports`` or ``modified``).
    """

    def __init__(self):
        self.g: nx.DiGraph = nx.DiGraph()

    # -- persistence -------------------------------------------------------

    def save(self, path: Path) -> None:
        """Write the graph to `path` as node-link JSON, creating parent dirs."""
        data = nx.node_link_data(self.g)
        path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding so the file does not depend on the locale default.
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

    def load(self, path: Path) -> None:
        """Replace the graph with the one stored at `path`.

        Does nothing (keeps the current graph) when `path` does not exist.
        """
        if not path.exists():
            return
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        self.g = nx.node_link_graph(data)

    # -- file nodes --------------------------------------------------------

    def add_file(self, path: Path, root: Path) -> str:
        """Add (or overwrite the attributes of) the FILE node for `path`.

        Stats the file on disk, so `path` must exist. Returns the node id.
        """
        rel = str(path.relative_to(root))
        lang = _ext_to_lang(path.suffix) or "unknown"
        stat = path.stat()
        node_id = f"file:{rel}"
        self.g.add_node(
            node_id,
            kind="FILE",
            path=rel,
            language=lang,
            last_modified=stat.st_mtime,
            size=stat.st_size,
        )
        return node_id

    def file_nodes(self) -> list[dict]:
        """Return one dict per FILE node: its attributes plus an ``id`` key."""
        return [
            {"id": n, **d}
            for n, d in self.g.nodes(data=True)
            if d.get("kind") == "FILE"
        ]

    # -- symbol nodes ------------------------------------------------------

    def add_symbol(self, name: str, kind: str, file_path: str, line: int) -> str:
        """Add a SYMBOL node and, if its FILE node exists, a ``contains`` edge.

        Returns the symbol's node id.
        """
        node_id = f"sym:{file_path}:{name}:{line}"
        self.g.add_node(
            node_id,
            kind="SYMBOL",
            name=name,
            symbol_kind=kind,
            file_path=file_path,
            line_number=line,
        )
        file_id = f"file:{file_path}"
        if self.g.has_node(file_id):
            self.g.add_edge(file_id, node_id, rel="contains")
        return node_id

    # -- import edges ------------------------------------------------------

    def add_import_edge(self, from_file: str, to_file: str) -> None:
        """Add an ``imports`` edge between two FILE nodes if both exist."""
        src = f"file:{from_file}"
        dst = f"file:{to_file}"
        if self.g.has_node(src) and self.g.has_node(dst):
            self.g.add_edge(src, dst, rel="imports")

    # -- commit nodes ------------------------------------------------------

    def add_commit(self, commit: Commit, root: Path) -> str:
        """Add a COMMIT node plus ``modified`` edges to the files it touched.

        Only files that already have a FILE node get an edge. `root` is
        accepted for interface compatibility and is not used here.
        Returns the commit's node id.
        """
        node_id = f"commit:{commit.hash}"
        self.g.add_node(
            node_id,
            kind="COMMIT",
            hash=commit.hash,
            message=commit.message,
            timestamp=commit.timestamp.isoformat(),
        )
        for rel_path in commit.files_touched:
            file_id = f"file:{rel_path}"
            if self.g.has_node(file_id):
                self.g.add_edge(node_id, file_id, rel="modified")
        return node_id

    # -- lookup helpers ----------------------------------------------------

    def find_file_nodes_for_symbol(self, symbol: str) -> list[str]:
        """Return the FILE node ids that define a symbol named `symbol`.

        Ids are unique and listed in first-seen order.
        """
        # dict keys give order-preserving de-duplication without the
        # quadratic ``x not in list`` check.
        found: dict[str, None] = {}
        for _, d in self.g.nodes(data=True):
            if d.get("kind") == "SYMBOL" and d.get("name") == symbol:
                found.setdefault(f"file:{d['file_path']}", None)
        return list(found)

    def get_import_neighbors(self, file_id: str, hops: int = 2) -> dict[str, int]:
        """Return FILE nodes within `hops` edges of `file_id`, with hop distance.

        Edges are followed in both directions and only FILE neighbours are
        kept. The start node is never reported as its own neighbour, and an
        unknown `file_id` yields an empty dict instead of raising.
        """
        if not self.g.has_node(file_id):
            return {}

        visited: dict[str, int] = {}
        frontier = [file_id]
        for hop in range(1, hops + 1):
            next_frontier = []
            for node in frontier:
                neighbors = (*self.g.successors(node), *self.g.predecessors(node))
                for neighbor in neighbors:
                    # Skip the origin: a cycle such as A -> B -> A would
                    # otherwise list A at distance 2 from itself.
                    if neighbor == file_id or neighbor in visited:
                        continue
                    if self.g.nodes[neighbor].get("kind") == "FILE":
                        visited[neighbor] = hop
                        next_frontier.append(neighbor)
            frontier = next_frontier
        return visited

    def stats(self) -> dict:
        """Return node counts per ``kind`` and the total number of edges."""
        kinds: dict[str, int] = {}
        for _, d in self.g.nodes(data=True):
            k = d.get("kind", "UNKNOWN")
            kinds[k] = kinds.get(k, 0) + 1
        return {"nodes": dict(kinds), "edges": self.g.number_of_edges()}

    # -- incremental update helpers ----------------------------------------

    def remove_file_data(self, rel_path: str) -> None:
        """Remove FILE node, all its SYMBOL nodes, and all their edges."""
        file_id = f"file:{rel_path}"

        sym_nodes = [
            n for n, d in self.g.nodes(data=True)
            if d.get("kind") == "SYMBOL" and d.get("file_path") == rel_path
        ]
        for sym in sym_nodes:
            self.g.remove_node(sym)

        if self.g.has_node(file_id):
            self.g.remove_node(file_id)  # networkx removes all edges automatically

    def update_file_data(self, file_path: Path, root: Path,
                         all_file_rels: set[str]) -> None:
        """Remove stale data for a file then re-add fresh nodes and edges.

        If the file no longer exists on disk its data is only removed.
        Edges that pointed *at* the file (other files' ``imports`` and
        commits' ``modified``) are carried over to the re-created node,
        because they come from nodes that are not re-scanned here.
        """
        rel = str(file_path.relative_to(root))
        file_id = f"file:{rel}"

        # Snapshot incoming edges before removal; remove_node() drops them.
        incoming: list[tuple[str, dict]] = []
        if self.g.has_node(file_id):
            incoming = [
                (src, dict(attrs))
                for src, attrs in self.g.pred[file_id].items()
                if src != file_id
            ]

        self.remove_file_data(rel)

        if not file_path.exists():
            return

        self.add_file(file_path, root)

        for src, attrs in incoming:
            if self.g.has_node(src):
                self.g.add_edge(src, file_id, **attrs)

        lang = _ext_to_lang(file_path.suffix)
        if not lang:
            return

        for sym in extract_symbols_ripgrep(file_path, root):
            self.add_symbol(sym["name"], sym["kind"], sym["file"], sym["line"])

        for imp in extract_imports(file_path, root, lang):
            target = _resolve_import_to_file(imp, all_file_rels, lang)
            if target and target != rel:
                self.add_import_edge(rel, target)

    def prune_stale_nodes(self, root: Path) -> list[str]:
        """Remove FILE nodes whose path no longer exists on disk.

        Returns the relative paths that were removed.
        """
        stale = [
            d["path"]
            for _, d in list(self.g.nodes(data=True))
            if d.get("kind") == "FILE" and not (root / d.get("path", "")).exists()
        ]
        for rel_path in stale:
            self.remove_file_data(rel_path)
        return stale
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
# ---------------------------------------------------------------------------
|
|
234
|
+
# Walk + symbol extraction
|
|
235
|
+
# ---------------------------------------------------------------------------
|
|
236
|
+
|
|
237
|
+
def _should_ignore(path: Path, ignore_paths: list[str]) -> bool:
|
|
238
|
+
path_str = str(path)
|
|
239
|
+
return any(ig in path.parts or ig in path_str for ig in ignore_paths)
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def walk_repo(root: Path, ignore_paths: list[str], languages: list[str]) -> list[Path]:
    """Collect source files under `root` for the requested languages.

    A path is kept when it is a regular file, its suffix belongs to one of
    `languages` (per LANG_EXTENSIONS; unknown language names contribute no
    suffixes), and `_should_ignore` does not reject it.
    """
    wanted_suffixes: set[str] = {
        suffix
        for language in languages
        for suffix in LANG_EXTENSIONS.get(language, [])
    }
    return [
        entry
        for entry in root.rglob("*")
        if entry.is_file()
        and entry.suffix in wanted_suffixes
        and not _should_ignore(entry, ignore_paths)
    ]
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def extract_symbols_ctags(root: Path) -> list[dict]:
    """Run universal-ctags over `root` and parse its JSON-lines output.

    Returns a list of ``{"name", "kind", "file", "line"}`` dicts, one per tag.
    Returns ``[]`` when the ``ctags`` executable is missing or the run exceeds
    the 10 second timeout. Lines that are not valid JSON, not JSON objects,
    or not of ``_type == "tag"`` are skipped.
    """
    try:
        result = subprocess.run(
            ["ctags", "-R", "--output-format=json", "--fields=+n", "."],
            capture_output=True,
            text=True,
            cwd=root,
            timeout=10,
        )
    except (FileNotFoundError, subprocess.TimeoutExpired):
        return []

    symbols = []
    for raw in result.stdout.splitlines():
        raw = raw.strip()
        if not raw:
            continue
        try:
            obj = json.loads(raw)
        except json.JSONDecodeError:
            continue
        # A valid JSON line need not be an object (e.g. a bare number).
        if not isinstance(obj, dict) or obj.get("_type") != "tag":
            continue
        path = obj.get("path", "")
        # ctags is invoked on "." so paths may come back as "./pkg/mod.py";
        # strip the prefix so they compare equal to repo-relative paths.
        if isinstance(path, str) and path.startswith("./"):
            path = path[2:]
        symbols.append({
            "name": obj.get("name", ""),
            "kind": obj.get("kind", "unknown"),
            "file": path,
            "line": obj.get("line", 0),
        })
    return symbols
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def extract_symbols_ripgrep(file_path: Path, root: Path) -> list[dict]:
    """Find symbol definitions in one file by running ripgrep per pattern.

    Each pattern is searched with ``rg`` and every hit is re-matched with
    Python's ``re`` to pull out the symbol name. Results are grouped by
    pattern, in pattern order. If the ``rg`` executable is missing, the
    search stops and whatever was collected so far is returned.
    """
    symbol_regexes = (
        (r"^def ([A-Za-z_]\w*)\s*\(", "function"),
        (r"^class ([A-Za-z_]\w*)\s*[:(]", "class"),
        (r"^function ([A-Za-z_]\w*)\s*\(", "function"),
        (r"^\s+(?:public|private|protected)\s+\w+\s+([A-Za-z_]\w*)\s*\(", "method"),
    )
    rel_path = str(file_path.relative_to(root))
    found: list[dict] = []

    for regex, symbol_kind in symbol_regexes:
        command = ["rg", "--line-number", "--no-heading", regex, str(file_path)]
        try:
            completed = subprocess.run(command, capture_output=True, text=True)
        except FileNotFoundError:
            break

        for hit in completed.stdout.splitlines():
            # rg prints "<line number>:<content>" for a single-file search.
            number_text, separator, content = hit.partition(":")
            if not separator:
                continue
            try:
                lineno = int(number_text)
            except ValueError:
                continue
            matched = re.search(regex, content)
            if matched is not None:
                found.append({
                    "name": matched.group(1),
                    "kind": symbol_kind,
                    "file": rel_path,
                    "line": lineno,
                })
    return found
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def extract_imports(file_path: Path, root: Path, language: str) -> list[str]:
    """Return the stripped source lines of `file_path` that look like imports.

    Runs ripgrep once per regex registered for `language` in IMPORT_PATTERNS
    (none for an unknown language). A line matching several regexes appears
    once per match. If ``rg`` is not installed the search stops early.
    `root` is unused here.
    """
    matched_lines: list[str] = []
    for regex in IMPORT_PATTERNS.get(language, []):
        command = ["rg", "--no-heading", "--no-line-number", regex, str(file_path)]
        try:
            completed = subprocess.run(command, capture_output=True, text=True)
        except FileNotFoundError:
            break
        matched_lines.extend(raw.strip() for raw in completed.stdout.splitlines())
    return matched_lines
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def _resolve_import_to_file(import_line: str, all_file_rels: set[str], language: str) -> str | None:
|
|
345
|
+
"""Best-effort: map an import statement to a file node path."""
|
|
346
|
+
# Python: "from dug.graph import CodeGraph" → look for dug/graph.py
|
|
347
|
+
# Java: "import com.example.Foo" → com/example/Foo.java
|
|
348
|
+
# JS/TS: "import ... from './utils'" → utils.ts / utils.js
|
|
349
|
+
if language == "python":
|
|
350
|
+
m = re.search(r"^from ([\w.]+) import|^import ([\w.]+)", import_line)
|
|
351
|
+
if m:
|
|
352
|
+
mod = (m.group(1) or m.group(2)).replace(".", "/")
|
|
353
|
+
for ext in [".py"]:
|
|
354
|
+
candidate = mod + ext
|
|
355
|
+
if candidate in all_file_rels:
|
|
356
|
+
return candidate
|
|
357
|
+
elif language in ("typescript", "javascript"):
|
|
358
|
+
m = re.search(r"""from\s+['"]([^'"]+)['"]""", import_line)
|
|
359
|
+
if m:
|
|
360
|
+
raw = m.group(1)
|
|
361
|
+
for ext in [".ts", ".tsx", ".js", ".jsx"]:
|
|
362
|
+
candidate = raw.lstrip("./") + ext
|
|
363
|
+
for f in all_file_rels:
|
|
364
|
+
if f.endswith(candidate):
|
|
365
|
+
return f
|
|
366
|
+
elif language == "java":
|
|
367
|
+
m = re.search(r"^import\s+([\w.]+);", import_line)
|
|
368
|
+
if m:
|
|
369
|
+
candidate = m.group(1).replace(".", "/") + ".java"
|
|
370
|
+
if candidate in all_file_rels:
|
|
371
|
+
return candidate
|
|
372
|
+
return None
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
# ---------------------------------------------------------------------------
|
|
376
|
+
# Public build function
|
|
377
|
+
# ---------------------------------------------------------------------------
|
|
378
|
+
|
|
379
|
+
def build_graph(root: Path, config: dict) -> CodeGraph:
    """Build a fresh CodeGraph for the repository at `root`.

    Reads ``ignore_paths``, ``languages`` and ``git_history_depth`` (default
    50) from `config`, then populates the graph in four passes: FILE nodes,
    SYMBOL nodes, FILE->FILE import edges, and COMMIT nodes.
    """
    graph = CodeGraph()
    source_files = walk_repo(
        root,
        config.get("ignore_paths", []),
        config.get("languages", []),
    )
    history_depth = config.get("git_history_depth", 50)

    # Pass 1: FILE nodes.
    for source in source_files:
        graph.add_file(source, root)

    all_file_rels: set[str] = {str(source.relative_to(root)) for source in source_files}

    # Pass 2: SYMBOL nodes. ctags is tried first; only when it yields nothing
    # at all does each file get scanned with ripgrep instead.
    tags = extract_symbols_ctags(root)
    if not tags:
        for source in source_files:
            if _ext_to_lang(source.suffix):
                for found in extract_symbols_ripgrep(source, root):
                    graph.add_symbol(
                        found["name"], found["kind"], found["file"], found["line"]
                    )
    else:
        for tag in tags:
            tagged_file = tag["file"]
            if tagged_file in all_file_rels:
                graph.add_symbol(tag["name"], tag["kind"], tagged_file, tag["line"])

    # Pass 3: FILE -> FILE import edges.
    for source in source_files:
        language = _ext_to_lang(source.suffix)
        if not language:
            continue
        statements = extract_imports(source, root, language)
        source_rel = str(source.relative_to(root))
        for statement in statements:
            resolved = _resolve_import_to_file(statement, all_file_rels, language)
            if resolved and resolved != source_rel:
                graph.add_import_edge(source_rel, resolved)

    # Pass 4: COMMIT nodes.
    for commit in get_git_history(root, depth=history_depth):
        graph.add_commit(commit, root)

    return graph
|
dug/history.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
"""Learning loop — stores past bug→file resolutions and boosts similar future queries."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import json
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
|
+
from difflib import SequenceMatcher
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from .config import get_dug_dir
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# ---------------------------------------------------------------------------
|
|
15
|
+
# Persistence
|
|
16
|
+
# ---------------------------------------------------------------------------
|
|
17
|
+
|
|
18
|
+
def get_history_path() -> Path:
    """Return the path of ``history.json`` inside the dug directory."""
    dug_dir = get_dug_dir()
    return dug_dir / "history.json"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def get_last_query_path() -> Path:
    """Return the path of ``last_query.json`` inside the dug directory."""
    dug_dir = get_dug_dir()
    return dug_dir / "last_query.json"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def load_history() -> list[dict]:
    """Load the stored history entries, or ``[]`` when no history file exists."""
    history_file = get_history_path()
    if history_file.exists():
        with open(history_file) as handle:
            return json.load(handle)
    return []
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def save_history(entries: list[dict]) -> None:
    """Overwrite the history file with `entries`, creating its directory first."""
    history_file = get_history_path()
    history_file.parent.mkdir(parents=True, exist_ok=True)
    with open(history_file, "w") as handle:
        json.dump(entries, handle, indent=2)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def save_last_query(bug_input: str, ranked_file_paths: list[str], signals: dict) -> None:
    """Persist the most recent query, its ranked files and signals.

    The record is stamped with the current UTC time and overwrites any
    previously saved query.
    """
    target = get_last_query_path()
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, "w") as handle:
        record = {
            "bug_input": bug_input,
            "ranked_files": ranked_file_paths,
            "signals": signals,
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }
        json.dump(record, handle, indent=2)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def load_last_query() -> dict | None:
    """Return the last saved query record, or None when none has been saved."""
    source = get_last_query_path()
    if source.exists():
        with open(source) as handle:
            return json.load(handle)
    return None
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# ---------------------------------------------------------------------------
|
|
62
|
+
# Record a resolved bug
|
|
63
|
+
# ---------------------------------------------------------------------------
|
|
64
|
+
|
|
65
|
+
def record_resolved(bug_input: str, resolved_files: list[str], signals: dict) -> None:
    """Record that `bug_input` was resolved by `resolved_files`.

    Entries are keyed by the MD5 hex digest of the bug text. If the same text
    was recorded before, its file list is extended (order-preserving, no
    duplicates), its solve count incremented and its timestamp refreshed;
    otherwise a new entry is appended. The history file is rewritten either way.
    """
    history = load_history()
    digest = hashlib.md5(bug_input.encode()).hexdigest()

    # First entry with a matching id, if the bug was solved before.
    existing = next((item for item in history if item["id"] == digest), None)

    if existing is not None:
        combined = existing["resolved_files"] + resolved_files
        existing["resolved_files"] = list(dict.fromkeys(combined))
        existing["solve_count"] = existing.get("solve_count", 1) + 1
        existing["last_solved"] = datetime.now(timezone.utc).isoformat()
    else:
        history.append({
            "id": digest,
            "bug_input": bug_input,
            "error_type": signals.get("error_type"),
            "signals": {
                "files": signals.get("files", []),
                "symbols": signals.get("symbols", []),
            },
            "resolved_files": resolved_files,
            "solve_count": 1,
            "last_solved": datetime.now(timezone.utc).isoformat(),
        })

    save_history(history)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# ---------------------------------------------------------------------------
|
|
97
|
+
# Similarity matching
|
|
98
|
+
# ---------------------------------------------------------------------------
|
|
99
|
+
|
|
100
|
+
_STOPWORDS = {"the", "a", "an", "in", "at", "on", "is", "was", "with",
|
|
101
|
+
"and", "or", "for", "to", "of", "from", "that", "this",
|
|
102
|
+
"it", "not", "by", "be", "are", "has", "have", "had"}
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _word_tokens(text: str) -> set[str]:
|
|
106
|
+
"""
|
|
107
|
+
Significant words from a string, with CamelCase and snake_case splitting.
|
|
108
|
+
'NullPointerException' → {'null', 'pointer', 'exception'}
|
|
109
|
+
'load_config' → {'load', 'config'}
|
|
110
|
+
"""
|
|
111
|
+
import re
|
|
112
|
+
# Split CamelCase: NullPointerException → Null Pointer Exception
|
|
113
|
+
text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
|
|
114
|
+
# Split on everything non-alpha (underscores, dots, colons, spaces, etc.)
|
|
115
|
+
words = re.findall(r'[a-zA-Z]+', text.lower())
|
|
116
|
+
return {w for w in words if len(w) > 3 and w not in _STOPWORDS}
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _text_similarity(a: str, b: str) -> float:
    """Score how alike two texts are, weighting words over characters.

    The result is 0.4 x the case-insensitive SequenceMatcher ratio plus
    0.6 x the Jaccard index of the two texts' word-token sets (0.0 when
    neither text yields any token).
    """
    char_sim = SequenceMatcher(None, a.lower(), b.lower()).ratio()

    tokens_a, tokens_b = _word_tokens(a), _word_tokens(b)
    combined = tokens_a | tokens_b
    if combined:
        word_sim = len(tokens_a & tokens_b) / len(combined)
    else:
        word_sim = 0.0

    return char_sim * 0.4 + word_sim * 0.6
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _signals_overlap(signals_a: dict, signals_b: dict) -> float:
|
|
132
|
+
"""Fraction of shared files/symbols between two signal dicts (0–1)."""
|
|
133
|
+
files_a = set(signals_a.get("files", []))
|
|
134
|
+
files_b = set(signals_b.get("files", []))
|
|
135
|
+
syms_a = set(signals_a.get("symbols", []))
|
|
136
|
+
syms_b = set(signals_b.get("symbols", []))
|
|
137
|
+
|
|
138
|
+
total = len(files_a | files_b) + len(syms_a | syms_b)
|
|
139
|
+
if total == 0:
|
|
140
|
+
return 0.0
|
|
141
|
+
shared = len(files_a & files_b) + len(syms_a & syms_b)
|
|
142
|
+
return shared / total
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def find_similar_past_bugs(
    bug_input: str,
    signals: dict,
    threshold: float = 0.35,
) -> list[dict]:
    """
    Return history entries whose similarity to the current bug meets `threshold`.

    Score = 0.6 x text similarity + 0.25 x signal overlap, plus a flat 0.2
    when both bugs have the same (case-insensitive, non-empty) error type.
    Each returned entry is a copy with a rounded ``_similarity`` key added;
    results are ordered from most to least similar.
    """
    current_error = (signals.get("error_type") or "").lower()
    matches: list[dict] = []

    for past in load_history():
        text_score = _text_similarity(bug_input, past["bug_input"])

        # An exact error-type match gives a strong boost.
        past_error = (past.get("error_type") or "").lower()
        same_error = bool(current_error) and current_error == past_error
        error_bonus = 0.2 if same_error else 0.0

        overlap = _signals_overlap(signals, past.get("signals", {}))

        score = text_score * 0.6 + overlap * 0.25 + error_bonus
        if score < threshold:
            continue
        matches.append({**past, "_similarity": round(score, 3)})

    matches.sort(key=lambda item: item["_similarity"], reverse=True)
    return matches
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
# ---------------------------------------------------------------------------
|
|
177
|
+
# Scoring boost
|
|
178
|
+
# ---------------------------------------------------------------------------
|
|
179
|
+
|
|
180
|
+
def get_history_boost(
    bug_input: str,
    signals: dict,
    candidate_files: list[str],
) -> dict[str, float]:
    """
    Return {file_path: boost_score} for candidates that resolved similar bugs.

    Each similar past bug contributes 6.0 x its similarity to every one of
    its resolved files that is also a candidate (a 0.9-similar bug gives
    +5.4). A file keeps the largest boost it receives.
    """
    boosts: dict[str, float] = {}
    for past in find_similar_past_bugs(bug_input, signals):
        points = 6.0 * past["_similarity"]
        for filepath in past["resolved_files"]:
            if filepath not in candidate_files:
                continue
            boosts[filepath] = max(boosts.get(filepath, 0.0), points)
    return boosts
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
# ---------------------------------------------------------------------------
|
|
201
|
+
# Error pattern library
|
|
202
|
+
# ---------------------------------------------------------------------------
|
|
203
|
+
|
|
204
|
+
def get_error_pattern_boost(
    error_type: str | None,
    candidate_files: list[str],
) -> dict[str, float]:
    """
    Boost candidates that past bugs of the same error type were resolved in.

    Counts, per candidate file, how many history entries with a matching
    (case-insensitive) error type list it as resolved, then scales the counts
    so the most frequent file gets 3.0. Returns {} when `error_type` is
    empty or no candidate has such a history. No hardcoded rules are used.
    """
    if not error_type:
        return {}

    wanted = error_type.lower()
    frequency: dict[str, int] = {}

    for past in load_history():
        if (past.get("error_type") or "").lower() != wanted:
            continue
        for resolved in past.get("resolved_files", []):
            if resolved in candidate_files:
                frequency[resolved] = frequency.get(resolved, 0) + 1

    if not frequency:
        return {}

    # Normalize to the 0–3 boost range.
    peak = max(frequency.values())
    return {path: (hits / peak) * 3.0 for path, hits in frequency.items()}
|