dug-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dug/__init__.py +0 -0
- dug/__main__.py +297 -0
- dug/chunker.py +137 -0
- dug/config.py +77 -0
- dug/embeddings.py +97 -0
- dug/git_context.py +56 -0
- dug/graph.py +423 -0
- dug/history.py +231 -0
- dug/hooks.py +112 -0
- dug/indexer.py +294 -0
- dug/prompt_builder.py +106 -0
- dug/retriever.py +249 -0
- dug/vector_store.py +79 -0
- dug/verifier.py +73 -0
- dug/watcher.py +103 -0
- dug_cli-0.1.0.dist-info/METADATA +178 -0
- dug_cli-0.1.0.dist-info/RECORD +20 -0
- dug_cli-0.1.0.dist-info/WHEEL +4 -0
- dug_cli-0.1.0.dist-info/entry_points.txt +2 -0
- dug_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
dug/retriever.py
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
"""Hybrid retriever — merges structural graph lookup with semantic search."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
# Test file patterns — excluded from ranked results by default
|
|
10
|
+
_TEST_PATTERNS = (
|
|
11
|
+
"test_", "_test.", ".test.", ".spec.", "_spec.",
|
|
12
|
+
"/test/", "/tests/", "/spec/", "/__tests__/",
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _is_test_file(path: str) -> bool:
|
|
17
|
+
p = path.lower().replace("\\", "/")
|
|
18
|
+
return any(pat in p for pat in _TEST_PATTERNS)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _bug_tokens(text: str) -> set[str]:
|
|
22
|
+
"""Significant words from a bug string — reuses history.py logic inline."""
|
|
23
|
+
text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
|
|
24
|
+
words = re.findall(r'[a-zA-Z]+', text.lower())
|
|
25
|
+
stopwords = {"the", "a", "an", "in", "at", "on", "is", "was", "with",
|
|
26
|
+
"and", "or", "for", "to", "of", "from", "that", "this"}
|
|
27
|
+
return {w for w in words if len(w) > 3 and w not in stopwords}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ---------------------------------------------------------------------------
|
|
31
|
+
# Signal extraction
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
|
|
34
|
+
_ERROR_TYPES = [
    "NullPointerException", "NPE", "NullReferenceException",
    "KeyError", "TypeError", "ValueError", "AttributeError",
    "ImportError", "ModuleNotFoundError", "NameError",
    "IndexError", "RuntimeError", "AssertionError",
    "FileNotFoundError", "PermissionError", "TimeoutError",
    "ConnectionError", "HTTPError", "404", "500", "503",
    "StackOverflow", "OutOfMemoryError", "ClassNotFoundException",
]


def extract_signals(bug_input: str) -> dict:
    """Pull structured signals out of a raw bug string — pure regex, no LLM.

    Returns a dict with four keys:
      "files"        — path-like tokens ending in a supported source extension
                       (deduped, first-seen order)
      "symbols"      — candidate class / function names (deduped, first-seen order)
      "line_numbers" — every integer that directly follows a ":" (as ints)
      "error_type"   — first entry of _ERROR_TYPES contained in the input
                       (case-insensitive substring match), or None
    """
    # Longer extensions are listed first and a trailing \b is required;
    # otherwise "App.tsx" is captured as "App.ts" and "package.json" as "package.js".
    files = re.findall(r'[\w/.-]+\.(?:java|py|tsx|ts|jsx|js)\b', bug_input)
    # Java/Python stack trace symbols: "at ClassName.method(" or "in function_name".
    # The leading \b stops "at"/"in" matching the tail of another word
    # ("that foo(", "main thread").
    symbols = re.findall(r'\bat\s+(\w+)(?:\.\w+)*\s*\(', bug_input)
    symbols += re.findall(r'\bin\s+([a-z_]\w+)\b', bug_input)
    symbols += re.findall(r'([A-Z]\w*(?:Service|Controller|Handler|Manager|Processor|Client|Repository|Util|Helper))', bug_input)
    line_numbers = re.findall(r':(\d+)', bug_input)
    lowered = bug_input.lower()  # lower-case once, not once per candidate
    error_type = next((e for e in _ERROR_TYPES if e.lower() in lowered), None)

    return {
        "files": list(dict.fromkeys(files)),  # deduped, order preserved
        "symbols": list(dict.fromkeys(symbols)),
        "line_numbers": [int(n) for n in line_numbers],
        "error_type": error_type,
    }
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# ---------------------------------------------------------------------------
|
|
64
|
+
# Ranked file result
|
|
65
|
+
# ---------------------------------------------------------------------------
|
|
66
|
+
|
|
67
|
+
@dataclass
class RankedFile:
    """One entry of the ranked result list: a file, its score and the evidence for it."""

    # File path as used for the keys of the score tables
    path: str
    # Combined score after all layers were merged
    score: float
    # Human-readable explanations of where the score came from
    reasons: list[str] = field(default_factory=list)
    # Copied from the graph node's "last_modified" attribute; 0.0 when absent
    last_modified: float = 0.0
    # Files reached from this one over an "imports" edge
    imports: list[str] = field(default_factory=list)
    # Path produced by following import edges starting at this file
    import_chain: list[str] = field(default_factory=list)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# ---------------------------------------------------------------------------
|
|
78
|
+
# Scoring helpers
|
|
79
|
+
# ---------------------------------------------------------------------------
|
|
80
|
+
|
|
81
|
+
def _score_structural(
    graph, signals: dict, bug_input: str = ""
) -> tuple[dict[str, float], dict[str, list[str]]]:
    """Score files based on structural graph signals.

    Args:
        graph: object exposing the graph as `graph.g` plus the helpers
            `find_file_nodes_for_symbol` and `get_import_neighbors`.
        signals: output of `extract_signals` ("files" and "symbols" are read).
        bug_input: raw bug text, compared against recent commit messages.

    Returns:
        `(scores, reasons)` — both keyed by file path with the "file:" prefix
        removed. `reasons[path]` lists each distinct reason once.
    """
    scores: dict[str, float] = {}
    reasons: dict[str, list[str]] = {}

    def add(file_id: str, pts: float, reason: str) -> None:
        path = file_id.removeprefix("file:")
        scores[path] = scores.get(path, 0.0) + pts
        path_reasons = reasons.setdefault(path, [])
        if reason not in path_reasons:
            path_reasons.append(reason)

    def mentions(sig_file: str, path: str) -> bool:
        # Compare whole path components. A plain substring test would let
        # "a.py" hit "data.py". Both directions are accepted so that a longer
        # (e.g. absolute) path in the bug text still matches a shorter graph path.
        path = path.replace("\\", "/")
        tail = sig_file.lstrip("/")
        return (
            path == tail
            or path.endswith("/" + tail)
            or sig_file.endswith("/" + path)
        )

    all_file_ids = {n for n, d in graph.g.nodes(data=True) if d.get("kind") == "FILE"}

    # +10: file directly mentioned in the bug input
    for sig_file in signals["files"]:
        for fid in all_file_ids:
            if mentions(sig_file, fid.removeprefix("file:")):
                add(fid, 10, "directly in stack trace")

    # +10: symbol mentioned → find file containing that symbol
    for sym in signals["symbols"]:
        for fid in graph.find_file_nodes_for_symbol(sym):
            add(fid, 10, f"contains symbol '{sym}'")

    # +5/+2: import neighbors of already-scored files.
    # Snapshot the seeds first: add() grows `scores` while we iterate.
    seeded = [f"file:{p}" for p in list(scores.keys())]
    for fid in seeded:
        neighbors = graph.get_import_neighbors(fid, hops=2)
        for neighbor_id, hop in neighbors.items():
            pts = 5 if hop == 1 else 2
            label = "1-hop import neighbor" if hop == 1 else "2-hop import neighbor"
            add(neighbor_id, pts, label)

    # +8 if commit message shares tokens with bug; +2 if recently modified but unrelated
    bug_tokens = _bug_tokens(bug_input)
    commit_nodes = [
        (n, d) for n, d in graph.g.nodes(data=True) if d.get("kind") == "COMMIT"
    ]
    # Only the three newest commits (by their "timestamp" attribute) are considered.
    recent_commits = sorted(
        commit_nodes, key=lambda x: x[1].get("timestamp", ""), reverse=True
    )[:3]
    for commit_id, commit_data in recent_commits:
        msg_tokens = _bug_tokens(commit_data.get("message", ""))
        relevant = bool(bug_tokens & msg_tokens) if bug_tokens else False
        if relevant:
            pts, label = 8, "modified in relevant recent commit"
        else:
            pts, label = 2, "modified recently (unrelated commit)"
        for neighbor in graph.g.successors(commit_id):
            if graph.g.nodes[neighbor].get("kind") == "FILE":
                add(neighbor, pts, label)

    return scores, reasons
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _score_semantic(semantic_hits: list[dict]) -> dict[str, float]:
|
|
136
|
+
"""Convert semantic search hits to file-level scores (+0 to +5)."""
|
|
137
|
+
scores: dict[str, float] = {}
|
|
138
|
+
for hit in semantic_hits:
|
|
139
|
+
path = hit["file_path"]
|
|
140
|
+
pts = hit["score"] * 5.0 # normalize 0–1 cosine → 0–5 points
|
|
141
|
+
scores[path] = max(scores.get(path, 0.0), pts)
|
|
142
|
+
return scores
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# ---------------------------------------------------------------------------
|
|
146
|
+
# Import chain builder
|
|
147
|
+
# ---------------------------------------------------------------------------
|
|
148
|
+
|
|
149
|
+
def _build_import_chain(graph, file_path: str, max_hops: int = 4) -> list[str]:
|
|
150
|
+
"""Walk import edges outward from `file_path` and return a chain."""
|
|
151
|
+
chain = [file_path]
|
|
152
|
+
current = f"file:{file_path}"
|
|
153
|
+
seen = {current}
|
|
154
|
+
for _ in range(max_hops):
|
|
155
|
+
neighbors = [
|
|
156
|
+
n for n in graph.g.successors(current)
|
|
157
|
+
if graph.g.nodes[n].get("kind") == "FILE" and n not in seen
|
|
158
|
+
and graph.g.edges[current, n].get("rel") == "imports"
|
|
159
|
+
]
|
|
160
|
+
if not neighbors:
|
|
161
|
+
break
|
|
162
|
+
current = neighbors[0]
|
|
163
|
+
seen.add(current)
|
|
164
|
+
chain.append(current.removeprefix("file:"))
|
|
165
|
+
return chain
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
# ---------------------------------------------------------------------------
|
|
169
|
+
# Public API
|
|
170
|
+
# ---------------------------------------------------------------------------
|
|
171
|
+
|
|
172
|
+
def hybrid_search(
    embedder,
    graph,
    vector_table,
    bug_input: str,
    top_k: int = 5,
) -> tuple[list[RankedFile], dict]:
    """
    Combine structural + semantic + history signals, return ranked files + signals.

    Args:
        embedder: object whose `embed(text)` returns the query vector.
        graph: structural graph wrapper (the underlying graph is `graph.g`).
        vector_table: table handed to `vector_store.search`.
        bug_input: raw bug report / stack trace text.
        top_k: maximum number of ranked files to return.

    Returns:
        `(ranked, signals)` — at most `top_k` RankedFile objects in descending
        score order, and the dict produced by `extract_signals`. Test files
        are skipped unless a file mentioned in the input matches them.
    """
    signals = extract_signals(bug_input)

    # Layer 1 — structural
    struct_scores, struct_reasons = _score_structural(graph, signals, bug_input)

    # Layer 2 — semantic
    query_vector = embedder.embed(bug_input)
    from .vector_store import search as vec_search
    semantic_hits = vec_search(vector_table, query_vector, top_k=15)
    sem_scores = _score_semantic(semantic_hits)

    # Merge layers 1 + 2
    all_paths = set(struct_scores) | set(sem_scores)
    merged: dict[str, float] = {}
    for path in all_paths:
        merged[path] = struct_scores.get(path, 0.0) + sem_scores.get(path, 0.0)

    # Layer 3 — history boost (+0 to +6 based on past resolutions)
    from .history import get_history_boost, get_error_pattern_boost
    candidate_files = list(merged.keys())
    history_boosts = get_history_boost(bug_input, signals, candidate_files)
    pattern_boosts = get_error_pattern_boost(signals.get("error_type"), candidate_files)

    history_reasons: dict[str, str] = {}
    for path, pts in history_boosts.items():
        merged[path] = merged.get(path, 0.0) + pts
        history_reasons[path] = f"resolved similar bug before (+{pts:.1f})"

    for path, pts in pattern_boosts.items():
        merged[path] = merged.get(path, 0.0) + pts
        # A history reason, when present, takes precedence over the pattern reason.
        if path not in history_reasons:
            history_reasons[path] = f"common in {signals.get('error_type')} errors (+{pts:.1f})"

    # Build RankedFile objects — skip test files unless explicitly mentioned in input
    explicitly_mentioned = {f.lower() for f in signals["files"]}
    ranked = []
    for path, score in sorted(merged.items(), key=lambda x: x[1], reverse=True):
        if len(ranked) >= top_k:
            break
        if _is_test_file(path) and not any(t in path.lower() for t in explicitly_mentioned):
            continue
        file_id = f"file:{path}"
        # A candidate (e.g. one that only came from the semantic layer) may have
        # no node in the graph. Asking the graph for successors of an unknown
        # node would raise, so such files get empty import information instead.
        in_graph = file_id in graph.g.nodes
        node_data = graph.g.nodes.get(file_id, {})

        reasons = list(struct_reasons.get(path, []))
        sem_score = sem_scores.get(path, 0.0)
        if sem_score > 0:
            reasons.append(f"semantic match ({sem_score:.2f}/5)")
        if path in history_reasons:
            reasons.append(history_reasons[path])

        if in_graph:
            raw_imports = [
                n.removeprefix("file:")
                for n in graph.g.successors(file_id)
                if graph.g.nodes.get(n, {}).get("kind") == "FILE"
                and graph.g.edges.get((file_id, n), {}).get("rel") == "imports"
            ]
            import_chain = _build_import_chain(graph, path)
        else:
            raw_imports = []
            import_chain = [path]

        ranked.append(RankedFile(
            path=path,
            score=score,
            reasons=reasons,
            last_modified=node_data.get("last_modified", 0.0),
            imports=raw_imports,
            import_chain=import_chain,
        ))

    return ranked, signals
|
dug/vector_store.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""LanceDB vector store — file-based, no server required."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import lancedb
|
|
8
|
+
import pyarrow as pa
|
|
9
|
+
|
|
10
|
+
# Dimension for all-MiniLM-L6-v2 (local). OpenAI text-embedding-3-small = 1536.
|
|
11
|
+
_DIM_LOCAL = 384
|
|
12
|
+
_DIM_OPENAI = 1536
|
|
13
|
+
|
|
14
|
+
TABLE_NAME = "functions"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _schema(dim: int) -> pa.Schema:
    """Build the Arrow schema of the chunk table for `dim`-sized vectors.

    Columns, in order: chunk_id, file_path, function_name (strings),
    start_line, end_line (int32), language (string) and vector
    (fixed-size list of `dim` float32 values).
    """
    columns = [
        pa.field(name, pa.string())
        for name in ("chunk_id", "file_path", "function_name")
    ]
    columns += [pa.field(name, pa.int32()) for name in ("start_line", "end_line")]
    columns.append(pa.field("language", pa.string()))
    columns.append(pa.field("vector", pa.list_(pa.float32(), dim)))
    return pa.schema(columns)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def get_or_create_table(db_path: Path, embedding_mode: str = "local") -> lancedb.table.Table:
    """Open the chunk table under `db_path`, creating directory and table if needed.

    `embedding_mode` only matters when the table is created: "openai" selects
    the OpenAI vector dimension, any other value the local one. An existing
    table is opened as-is.
    """
    db_path.mkdir(parents=True, exist_ok=True)
    db = lancedb.connect(str(db_path))
    if TABLE_NAME not in db.table_names():
        dim = _DIM_OPENAI if embedding_mode == "openai" else _DIM_LOCAL
        return db.create_table(TABLE_NAME, schema=_schema(dim))
    return db.open_table(TABLE_NAME)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def delete_file_chunks(table: lancedb.table.Table, rel_path: str) -> None:
    """Remove all chunk rows belonging to a specific file.

    Deletion is best-effort: a failure never propagates to the caller, but it
    is logged as a warning so that chunks left behind do not go unnoticed.
    """
    import logging

    # LanceDB uses SQL-style string for delete predicate; single quotes in the
    # path are doubled so they stay inside the string literal.
    escaped = rel_path.replace("'", "''")
    try:
        table.delete(f"file_path = '{escaped}'")
    except Exception as exc:
        logging.getLogger(__name__).warning(
            "could not delete indexed chunks for %s: %s", rel_path, exc
        )
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def upsert_chunks(table: lancedb.table.Table, rows: list[dict]) -> None:
    """Insert `rows` into `table`, replacing rows that share a chunk_id.

    Does nothing when `rows` is empty.
    """
    if rows:
        # LanceDB merge_insert: overwrite rows with matching chunk_id
        merge = table.merge_insert("chunk_id")
        merge = merge.when_matched_update_all()
        merge = merge.when_not_matched_insert_all()
        merge.execute(rows)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def search(
    table: lancedb.table.Table,
    query_vector: list[float],
    top_k: int = 10,
) -> list[dict]:
    """Run a cosine-metric vector search and return up to `top_k` hit dicts.

    Each hit carries the chunk's identifying columns plus "score", computed as
    1.0 minus the row's "_distance" (a missing distance counts as 0.0).
    """
    query = table.search(query_vector).metric("cosine").limit(top_k)
    copied_columns = (
        "chunk_id", "file_path", "function_name",
        "start_line", "end_line", "language",
    )
    hits = []
    for row in query.to_list():
        hit = {column: row[column] for column in copied_columns}
        hit["score"] = 1.0 - row.get("_distance", 0.0)  # cosine: distance→similarity
        hits.append(hit)
    return hits
|
dug/verifier.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""Verifier — confirms candidate files are genuinely relevant via ripgrep checks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import subprocess
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
# Minimum file size to be worth surfacing (bytes) — filters out empty/stub files
|
|
9
|
+
_MIN_FILE_BYTES = 50
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _rg_contains(pattern: str, abs_path: Path, fixed: bool = True) -> bool:
    """Return True when ripgrep finds `pattern` in the file at `abs_path`.

    Args:
        pattern: text to look for; treated literally when `fixed` is True,
            otherwise as a ripgrep regex.
        abs_path: file to search.
        fixed: pass `--fixed-strings` to ripgrep.

    Returns:
        True on exit status 0 (a match). Any other exit status — no match or
        a ripgrep error — gives False. If the `rg` executable is not
        installed the check cannot run and True is returned.
    """
    flags = ["--fixed-strings"] if fixed else []
    try:
        result = subprocess.run(
            # -e marks the next argument as the pattern and "--" ends option
            # parsing, so neither a pattern nor a path beginning with "-" can
            # be mistaken for a command-line flag.
            ["rg", *flags, "--quiet", "-e", pattern, "--", str(abs_path)],
            capture_output=True,
        )
        return result.returncode == 0
    except FileNotFoundError:
        return True  # rg not available — assume true
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def verify_files(
    candidate_files: list[str],
    symbols: list[str],
    root: Path,
    bug_input: str = "",
) -> list[str]:
    """
    Multi-pass verification — drops candidates that fail the checks.

    Pass 1 (always): file must exist under `root` and be non-trivially sized.
    Pass 2 (when symbols extracted): file must contain at least one symbol.

    When no symbols were extracted, every candidate that survives pass 1 is
    kept. Word matching against `bug_input` was only ever a soft pass that
    never rejected a file, so it is not run; `bug_input` is accepted for
    backward compatibility and is unused.

    Returns:
        The confirmed paths in their original order, or `candidate_files`
        itself when every candidate was dropped.
    """
    confirmed = []

    for rel_path in candidate_files:
        abs_path = root / rel_path

        # Pass 1: existence + size
        if not abs_path.exists():
            continue
        if abs_path.stat().st_size < _MIN_FILE_BYTES:
            continue

        # Pass 2: symbol presence (when symbols available)
        if symbols and not any(_rg_contains(sym, abs_path) for sym in symbols):
            continue

        confirmed.append(rel_path)

    # Safety net: never return empty — if all dropped, return originals
    return confirmed if confirmed else candidate_files
|
dug/watcher.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""File watcher — OS-native filesystem events + 1.5s debounce before reindex."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import threading
|
|
6
|
+
import time
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from watchdog.events import FileSystemEventHandler
|
|
10
|
+
from watchdog.observers import Observer
|
|
11
|
+
|
|
12
|
+
from .graph import LANG_EXTENSIONS, _should_ignore
|
|
13
|
+
|
|
14
|
+
DEBOUNCE_SECONDS = 1.5
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class _DebounceHandler(FileSystemEventHandler):
    """Debounce filesystem events per path and reindex once a path goes quiet.

    Every relevant event (re)starts a DEBOUNCE_SECONDS timer for its path;
    the reindex itself runs when that timer fires.
    """

    def __init__(self, root: Path, ignore_paths: list[str],
                 valid_exts: set[str], embedder):
        self.root = root
        self.ignore_paths = ignore_paths
        self.valid_exts = valid_exts
        self.embedder = embedder
        # Pending timer per path string. Guarded by _lock because both the
        # event callbacks (_schedule) and the timer callback (_reindex) touch it.
        self._timers: dict[str, threading.Timer] = {}
        self._lock = threading.Lock()

    # watchdog fires on_modified for saves, on_created for new files,
    # on_deleted for deletions and on_moved for renames — all need reindex
    def on_modified(self, event):
        self._handle(event)

    def on_created(self, event):
        self._handle(event)

    def on_deleted(self, event):
        self._handle(event)

    def on_moved(self, event):
        # A rename arrives as one moved event, including a save that writes a
        # temporary file and renames it over the original. The old path has
        # gone and the new path has fresh content, so both are considered.
        if event.is_directory:
            return
        self._consider(Path(event.src_path))
        self._consider(Path(event.dest_path))

    def _handle(self, event):
        if event.is_directory:
            return
        self._consider(Path(event.src_path))

    def _consider(self, path: Path) -> None:
        """Schedule `path` for reindexing unless its extension or location excludes it."""
        if path.suffix not in self.valid_exts:
            return
        if _should_ignore(path, self.ignore_paths):
            return
        self._schedule(path)

    def _schedule(self, path: Path) -> None:
        """Debounce: reset the timer on every save, fire only after silence."""
        key = str(path)
        with self._lock:
            if key in self._timers:
                self._timers[key].cancel()
            timer = threading.Timer(DEBOUNCE_SECONDS, self._reindex, args=[path])
            self._timers[key] = timer
            timer.start()

    def _reindex(self, path: Path) -> None:
        """Timer callback: reindex `path` and print the outcome."""
        # Imported lazily, as in the rest of this module's function-level imports.
        from .indexer import update_file
        key = str(path)
        with self._lock:
            self._timers.pop(key, None)
        try:
            result = update_file(path, self.root, self.embedder)
            if not result.get("skipped"):
                rel = result.get("updated", path.name)
                chunks = result.get("chunks", 0)
                print(f"\r[dug] ✓ {rel} ({chunks} chunks reindexed) ", flush=True)
        except Exception as e:
            # One failing file must not end the watch session; report and carry on.
            print(f"\r[dug] ✗ error reindexing {path.name}: {e}", flush=True)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def start_watch(root: Path | None = None) -> None:
    """Start the file watcher. Blocks until Ctrl+C.

    `root` defaults to the repository root found by `find_repo_root()`. Only
    files whose extension belongs to a language listed in the config are
    watched; the observer is always stopped and joined on the way out.
    """
    from .config import find_repo_root, load_config
    from .embeddings import get_embedder

    if not root:
        root = find_repo_root()
    config = load_config()

    # Union of the extensions of every configured language
    valid_exts: set[str] = {
        ext
        for lang in config.get("languages", [])
        for ext in LANG_EXTENSIONS.get(lang, [])
    }

    handler = _DebounceHandler(
        root,
        config.get("ignore_paths", []),
        valid_exts,
        get_embedder(config),
    )

    observer = Observer()
    observer.schedule(handler, str(root), recursive=True)
    observer.start()

    print(f"[dug] watching {root}")
    print(f"[dug] debounce: {DEBOUNCE_SECONDS}s — Ctrl+C to stop")
    try:
        # Idle here; all work happens on the observer and timer threads.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        pass
    finally:
        observer.stop()
        observer.join()
        print("\n[dug] watcher stopped.")
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dug-cli
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Dig into any bug with full codebase context — zero LLM calls
|
|
5
|
+
Project-URL: Homepage, https://github.com/ratishjain12/dug
|
|
6
|
+
Project-URL: Repository, https://github.com/ratishjain12/dug
|
|
7
|
+
Project-URL: Bug Tracker, https://github.com/ratishjain12/dug/issues
|
|
8
|
+
Author-email: Ratish Jain <ratishjain6@gmail.com>
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: claude,cli,code-search,debugging,developer-tools
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Software Development :: Debuggers
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Requires-Dist: click
|
|
23
|
+
Requires-Dist: lancedb
|
|
24
|
+
Requires-Dist: networkx
|
|
25
|
+
Requires-Dist: tree-sitter-java
|
|
26
|
+
Requires-Dist: tree-sitter-javascript
|
|
27
|
+
Requires-Dist: tree-sitter-python
|
|
28
|
+
Requires-Dist: tree-sitter-typescript>=0.23.2
|
|
29
|
+
Requires-Dist: tree-sitter>=0.22
|
|
30
|
+
Requires-Dist: watchdog
|
|
31
|
+
Provides-Extra: all
|
|
32
|
+
Requires-Dist: openai; extra == 'all'
|
|
33
|
+
Requires-Dist: sentence-transformers; extra == 'all'
|
|
34
|
+
Provides-Extra: local
|
|
35
|
+
Requires-Dist: sentence-transformers; extra == 'local'
|
|
36
|
+
Provides-Extra: openai
|
|
37
|
+
Requires-Dist: openai; extra == 'openai'
|
|
38
|
+
Description-Content-Type: text/markdown
|
|
39
|
+
|
|
40
|
+
# dug
|
|
41
|
+
|
|
42
|
+
**Dig into any bug with full codebase context — zero LLM calls.**
|
|
43
|
+
|
|
44
|
+
[PyPI package](https://pypi.org/project/dug-cli/)
|
|
45
|
+
[License: MIT](LICENSE)
|
|
46
|
+
[Python 3.10+](https://pypi.org/project/dug-cli/)
|
|
47
|
+
|
|
48
|
+
`dug` takes a bug report or stack trace and generates a structured [Claude Code](https://claude.ai/code) prompt that includes the exact files, functions, and context needed to fix it — using grep, AST parsing, and a local vector index with **no API calls and no LLM required**.
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## Install
|
|
53
|
+
|
|
54
|
+
```sh
|
|
55
|
+
# Recommended
|
|
56
|
+
pipx install dug-cli
|
|
57
|
+
|
|
58
|
+
# macOS (Homebrew)
|
|
59
|
+
brew tap ratishjain12/dug
|
|
60
|
+
brew install dug-cli
|
|
61
|
+
|
|
62
|
+
# One-liner (Linux / macOS)
|
|
63
|
+
curl -fsSL https://raw.githubusercontent.com/ratishjain12/dug/main/install.sh | sh
|
|
64
|
+
|
|
65
|
+
# Inside a virtualenv
|
|
66
|
+
pip install dug-cli
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## Quick start
|
|
72
|
+
|
|
73
|
+
```sh
|
|
74
|
+
# 1. Run once in your repo root to build the index
|
|
75
|
+
cd /your/project
|
|
76
|
+
dug init
|
|
77
|
+
|
|
78
|
+
# 2. Paste any bug report or stack trace
|
|
79
|
+
dug "NullPointerException in UserService.authenticate at line 42"
|
|
80
|
+
|
|
81
|
+
# dug prints a ready-to-paste Claude Code prompt with ranked file context
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
**Sample output:**
|
|
85
|
+
|
|
86
|
+
```
|
|
87
|
+
You are a senior engineer debugging this issue:
|
|
88
|
+
|
|
89
|
+
NullPointerException in UserService.authenticate at line 42
|
|
90
|
+
|
|
91
|
+
Relevant files (ranked by relevance):
|
|
92
|
+
|
|
93
|
+
1. src/auth/UserService.java:35
|
|
94
|
+
authenticate() — modified 2 commits ago
|
|
95
|
+
...
|
|
96
|
+
|
|
97
|
+
2. src/config/AppConfig.java:12
|
|
98
|
+
loadConfig() — error pattern match
|
|
99
|
+
...
|
|
100
|
+
|
|
101
|
+
[full function bodies + graph context follow]
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## How it works
|
|
107
|
+
|
|
108
|
+
`dug` builds a **local knowledge base** the first time you run `dug init`:
|
|
109
|
+
|
|
110
|
+
| Layer | What it builds | Used for |
|
|
111
|
+
|---|---|---|
|
|
112
|
+
| Structural graph | File → Symbol → Commit nodes (networkx) | Import chains, recent changes |
|
|
113
|
+
| Semantic index | Function embeddings in LanceDB (sentence-transformers) | Meaning-level matches |
|
|
114
|
+
| History log | Past bug→fix pairs | Learning from outcomes |
|
|
115
|
+
|
|
116
|
+
At query time, three signals are combined into a ranked list:
|
|
117
|
+
|
|
118
|
+
- **Structural score** — imports your error file, was modified in a related commit
|
|
119
|
+
- **Semantic score** — cosine similarity between bug text and function bodies
|
|
120
|
+
- **History boost** — similar past bugs pointed here
|
|
121
|
+
|
|
122
|
+
The index stays fresh via git hooks (`post-commit`, `post-checkout`) and an optional file watcher.
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## Commands
|
|
127
|
+
|
|
128
|
+
| Command | What it does |
|
|
129
|
+
|---|---|
|
|
130
|
+
| `dug init` | Index the current repo (builds graph + embeddings) |
|
|
131
|
+
| `dug "error text"` | Generate a Claude Code prompt for the bug |
|
|
132
|
+
| `dug update` | Re-index files changed since last commit |
|
|
133
|
+
| `dug watch` | Watch for file saves and re-index in real time |
|
|
134
|
+
| `dug stats` | Show index size (nodes, edges, chunks) |
|
|
135
|
+
| `dug config` | View / edit configuration |
|
|
136
|
+
| `dug feedback good` | Mark last query as helpful (improves future results) |
|
|
137
|
+
| `dug feedback bad` | Mark last query as unhelpful |
|
|
138
|
+
|
|
139
|
+
### Options
|
|
140
|
+
|
|
141
|
+
```sh
|
|
142
|
+
dug init --local # Use local embeddings (default, no API key needed)
|
|
143
|
+
dug init --openai # Use OpenAI text-embedding-3-small (needs OPENAI_API_KEY)
|
|
144
|
+
dug "error" --files 3 # Limit to top 3 files in prompt
|
|
145
|
+
dug "error" --no-history # Skip learning loop context
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## Configuration
|
|
151
|
+
|
|
152
|
+
`dug init` creates `.dug/config.json` in the repo root. You can also edit it with `dug config set <key> <value>`.
|
|
153
|
+
|
|
154
|
+
```json
|
|
155
|
+
{
|
|
156
|
+
"embedding_mode": "local",
|
|
157
|
+
"languages": ["python", "java", "typescript", "javascript"],
|
|
158
|
+
"max_files_in_prompt": 5,
|
|
159
|
+
"git_history_depth": 50,
|
|
160
|
+
"exclude_test_files": true
|
|
161
|
+
}
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
`.dug/` is automatically added to `.gitignore` — it's machine-specific and never committed.
|
|
165
|
+
|
|
166
|
+
---
|
|
167
|
+
|
|
168
|
+
## Contributing
|
|
169
|
+
|
|
170
|
+
```sh
|
|
171
|
+
git clone https://github.com/ratishjain12/dug
|
|
172
|
+
cd dug
|
|
173
|
+
uv sync
|
|
174
|
+
uv run dug init # index the dug repo itself
|
|
175
|
+
uv run dug "your bug here"
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
Requires Python 3.10+ and [uv](https://docs.astral.sh/uv/).
|