dug-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dug/__init__.py +0 -0
- dug/__main__.py +297 -0
- dug/chunker.py +137 -0
- dug/config.py +77 -0
- dug/embeddings.py +97 -0
- dug/git_context.py +56 -0
- dug/graph.py +423 -0
- dug/history.py +231 -0
- dug/hooks.py +112 -0
- dug/indexer.py +294 -0
- dug/prompt_builder.py +106 -0
- dug/retriever.py +249 -0
- dug/vector_store.py +79 -0
- dug/verifier.py +73 -0
- dug/watcher.py +103 -0
- dug_cli-0.1.0.dist-info/METADATA +178 -0
- dug_cli-0.1.0.dist-info/RECORD +20 -0
- dug_cli-0.1.0.dist-info/WHEEL +4 -0
- dug_cli-0.1.0.dist-info/entry_points.txt +2 -0
- dug_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
dug/__init__.py
ADDED
|
File without changes
|
dug/__main__.py
ADDED
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from .config import load_config, save_config, set_config_value, get_dug_dir, DEFAULTS, find_repo_root
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DefaultToQueryGroup(click.Group):
    """Click group that falls back to the ``query`` subcommand.

    Lets users type ``dug "some error"`` instead of ``dug query "some error"``.
    """

    def parse_args(self, ctx, args):
        """Prepend ``query`` when the first token is not an option or subcommand."""
        if args:
            first = args[0]
            looks_like_option = first.startswith("-")
            if not looks_like_option and first not in self.commands:
                # Free-form text: hand it to the query command.
                args = ["query", *args]
        return super().parse_args(ctx, args)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
# Helpers
|
|
20
|
+
# ---------------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
LANG_EXTENSIONS = {
    "python": [".py"],
    "java": [".java"],
    "typescript": [".ts", ".tsx"],
    "javascript": [".js", ".jsx"],
}

# Directory names holding third-party or generated code. Files below them must
# not influence language detection (e.g. node_modules ships .py/.ts/.java
# files that are not part of the project). Same names as the default
# "ignore_paths" setting in config.py.
_DETECT_SKIP_DIRS = frozenset({
    "node_modules", ".git", "build", "dist", "vendor", "__pycache__",
    ".venv", "venv", ".tox", "eggs", ".eggs",
})


def _detect_languages(root: Path, ignore_dirs=None) -> list[str]:
    """Return the languages that have at least one source file under ``root``.

    Args:
        root: Directory to scan recursively.
        ignore_dirs: Optional iterable of directory names to skip; defaults
            to ``_DETECT_SKIP_DIRS``.

    Returns:
        Language names in ``LANG_EXTENSIONS`` order, or ``["python"]`` when
        nothing was found.
    """
    skip = _DETECT_SKIP_DIRS if ignore_dirs is None else frozenset(ignore_dirs)

    def _has_project_file(ext: str) -> bool:
        for path in root.rglob(f"*{ext}"):
            # Only the directories between root and the file matter; the
            # location of root itself must not trigger a skip.
            parent_dirs = path.relative_to(root).parts[:-1]
            if skip.isdisjoint(parent_dirs):
                return True
        return False

    detected = [
        lang
        for lang, exts in LANG_EXTENSIONS.items()
        if any(_has_project_file(ext) for ext in exts)
    ]
    return detected or ["python"]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
# CLI
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
|
|
44
|
+
@click.group(cls=DefaultToQueryGroup, invoke_without_command=True)
@click.pass_context
def cli(ctx):
    """dug — dig into any bug with full codebase context."""
    # A subcommand was selected: click dispatches to it, nothing to do here.
    if ctx.invoked_subcommand is not None:
        return
    # Bare `dug`: print the top-level help text.
    click.echo(ctx.get_help())
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@cli.command()
def init():
    """First-time setup — wizard + full index build."""
    # Flow: choose the embedding backend, detect languages, save the config,
    # build the index, then install git hooks. Any failure during the
    # index/hook phase is reported on stderr and exits with status 1.
    click.echo("\nWelcome to dug.\n")

    # Embedding mode
    click.echo("Embedding mode:")
    click.echo(" 1. Local — no API key, runs on CPU (recommended)")
    click.echo(" 2. OpenAI — needs API key, faster")
    choice = click.prompt("\n>", default="1").strip()

    cfg = load_config()

    if choice == "2":
        # hide_input keeps the secret out of the terminal and its scrollback.
        api_key = click.prompt("OpenAI API key", hide_input=True).strip()
        cfg["embedding_mode"] = "openai"
        cfg["api_key"] = api_key
        click.echo("\n✓ Using OpenAI embeddings.")
    else:
        # Any answer other than "2" selects the local backend.
        cfg["embedding_mode"] = "local"
        cfg["api_key"] = None
        click.echo("\n✓ Using local embeddings. No API key needed.")

    # Language detection
    root = find_repo_root()
    detected = _detect_languages(root)
    click.echo(f"\nLanguages detected: {', '.join(detected)}")
    cfg["languages"] = detected

    # Ignore paths
    click.echo(f"Ignore paths: {', '.join(cfg['ignore_paths'])}")

    save_config(cfg)
    click.echo(f"\nConfig saved to {get_dug_dir() / 'config.json'}")

    # Build index
    click.echo("\nStarting initial index...")
    try:
        from .indexer import run_init
        from .embeddings import get_embedder
        from .hooks import install_git_hooks, ensure_gitignore

        gi_status = ensure_gitignore(root)
        click.echo(f" .gitignore: {gi_status}")
        embedder = get_embedder(cfg)
        stats = run_init(root, embedder=embedder)
        node_counts = stats["nodes"]
        files = node_counts.get("FILE", 0)
        symbols = node_counts.get("SYMBOL", 0)
        commits = node_counts.get("COMMIT", 0)
        edges = stats["edges"]
        chunks = stats.get("chunks", 0)
        click.echo(f" FILE nodes: {files}")
        click.echo(f" SYMBOL nodes: {symbols}")
        click.echo(f" COMMIT nodes: {commits}")
        click.echo(f" Total edges: {edges}")
        click.echo(f" Chunks embedded: {chunks}")

        # Install git hooks
        hook_results = install_git_hooks(root)
        if "error" in hook_results:
            # Previously dropped silently; the index is still usable, so
            # warn instead of aborting.
            click.echo(
                f"\n Git hooks: not installed ({hook_results['error']})",
                err=True,
            )
        else:
            click.echo("\n Git hooks:")
            for hook, status in hook_results.items():
                click.echo(f" {hook}: {status}")

        click.echo("\n✓ dug is ready. Run: dug \"your error here\"")
    except Exception as e:
        # CLI boundary: turn any indexing failure into a short message.
        click.echo(f"\n✗ Index failed: {e}", err=True)
        sys.exit(1)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
@cli.command(name="query")
@click.argument("bug_input")
def query(bug_input):
    """Query the index with a bug or stack trace."""
    cfg = load_config()
    dug_dir = get_dug_dir()
    graph_file = dug_dir / "graph.json"

    # Nothing to search until `dug init` has produced a graph.
    if not graph_file.exists():
        click.echo("No index found. Run: dug init", err=True)
        sys.exit(1)

    from .graph import CodeGraph
    from .embeddings import get_embedder
    from .vector_store import get_or_create_table
    from .retriever import hybrid_search
    from .verifier import verify_files
    from .prompt_builder import build_prompt
    from .git_context import get_git_history

    graph = CodeGraph()
    graph.load(graph_file)

    embedder = get_embedder(cfg)
    embedding_mode = cfg.get("embedding_mode", "local")
    table = get_or_create_table(dug_dir / "embeddings", embedding_mode)

    candidates, signals = hybrid_search(
        embedder,
        graph,
        table,
        bug_input,
        top_k=cfg.get("max_files_in_prompt", 5),
    )

    # Keep only the candidates that verify_files confirms against the
    # extracted symbols / bug text.
    root = find_repo_root()
    candidate_paths = [item.path for item in candidates]
    verified_paths = verify_files(
        candidate_paths, signals.get("symbols", []), root, bug_input
    )
    ranked = [item for item in candidates if item.path in verified_paths]

    history_depth = cfg.get("git_history_depth", 50)
    git_commits = get_git_history(root, depth=history_depth)

    click.echo(build_prompt(bug_input, ranked, git_commits, signals))

    # Save for `dug solved`
    from .history import save_last_query

    save_last_query(bug_input, [item.path for item in ranked], signals)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
@cli.command()
@click.option("--changed-only", is_flag=True, help="Reindex only git-changed files.")
@click.option("--branch-switch", is_flag=True, hidden=True)
@click.option("--from", "from_ref", default="HEAD~1", hidden=True)
@click.option("--to", "to_ref", default="HEAD", hidden=True)
def update(changed_only, branch_switch, from_ref, to_ref):
    """Refresh the graph and index."""
    root = find_repo_root()
    # Either flag selects the incremental path; otherwise rebuild everything.
    incremental = changed_only or branch_switch
    try:
        if not incremental:
            click.echo("Rebuilding full index...")
            from .indexer import run_init
            from .hooks import ensure_gitignore

            ensure_gitignore(root)
            stats = run_init(root)
            file_count = stats["nodes"].get("FILE", 0)
            chunk_count = stats.get("chunks", 0)
            click.echo(f"✓ Done — {file_count} files, {chunk_count} chunks.")
        else:
            from .indexer import update_changed_files

            result = update_changed_files(root, from_ref=from_ref, to_ref=to_ref)
            pruned = result.get("pruned", [])
            updated = result.get("updated", [])
            skipped = result.get("skipped", [])
            if pruned:
                click.echo(f" Pruned {len(pruned)} deleted file(s).")
            click.echo(
                f"✓ Updated {len(updated)} file(s), skipped {len(skipped)} unchanged."
            )
    except Exception as e:
        # CLI boundary: report the failure and exit non-zero.
        click.echo(f"✗ Update failed: {e}", err=True)
        sys.exit(1)
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
@cli.command()
def watch():
    """Start background file watcher — reindexes on save (1.5s debounce)."""
    dug_dir = get_dug_dir()
    if not (dug_dir / "graph.json").exists():
        click.echo("No index found. Run: dug init first.", err=True)
        sys.exit(1)
    from .watcher import start_watch

    # Watch the same root the index lives under. Using the current directory
    # would diverge from get_dug_dir() when run from a subdirectory.
    start_watch(find_repo_root())
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
@cli.command()
@click.option("--files", "-f", default=None,
              help="Comma-separated file paths that contained the fix.")
def solved(files):
    """Record which files fixed the last bug — improves future rankings."""
    from .history import load_last_query, record_resolved

    last = load_last_query()
    if not last:
        click.echo("No recent query found. Run: dug \"your error\" first.", err=True)
        sys.exit(1)

    bug_text = last["bug_input"]
    click.echo(f"\nLast query: \"{bug_text}\"")
    click.echo("Suggested files were:")
    for suggestion in last.get("ranked_files", []):
        click.echo(f" - {suggestion}")

    # Take the raw comma-separated list from --files, or ask interactively
    # with the previous suggestions as the default answer.
    if files:
        raw = files
    else:
        click.echo("\nWhich files actually contained the bug? (comma-separated paths)")
        click.echo("Press Enter to accept the suggestions above, or type new paths.")
        raw = click.prompt(">", default=",".join(last.get("ranked_files", [])))

    resolved = [part.strip() for part in raw.split(",") if part.strip()]
    if not resolved:
        click.echo("No files recorded.", err=True)
        sys.exit(1)

    record_resolved(bug_text, resolved, last.get("signals", {}))

    click.echo("\n✓ Saved. These files will rank higher for similar errors next time:")
    for path in resolved:
        click.echo(f" - {path}")
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
@cli.command()
def stats():
    """Print graph stats."""
    dug_dir = get_dug_dir()
    graph_file = dug_dir / "graph.json"
    if not graph_file.exists():
        click.echo("No index found. Run: dug init", err=True)
        sys.exit(1)

    from .graph import CodeGraph
    from .vector_store import get_or_create_table

    cfg = load_config()
    graph = CodeGraph()
    graph.load(graph_file)
    summary = graph.stats()

    click.echo("\nGraph stats:")
    for kind, count in summary["nodes"].items():
        click.echo(f" {kind}: {count}")
    click.echo(f" Edges: {summary['edges']}")

    # The embedding table is optional here: any failure opening or counting
    # it is shown as "n/a" rather than aborting the command.
    try:
        table = get_or_create_table(
            dug_dir / "embeddings", cfg.get("embedding_mode", "local")
        )
        chunk_total = table.count_rows()
        click.echo(f" Chunks (embedded): {chunk_total}")
    except Exception:
        click.echo(" Chunks (embedded): n/a")
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
# Container group only: the work is done by its `set` and `show` subcommands.
@cli.group()
def config():
    """Manage dug configuration."""
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
@config.command(name="set")
@click.argument("key")
@click.argument("value")
def config_set(key, value):
    """Set a config value. Example: dug config set embedding-mode openai"""
    # Keys are typed with dashes on the command line but stored with underscores.
    stored_key = key.replace("-", "_")
    set_config_value(stored_key, value)
    click.echo(f"✓ {stored_key} = {value}")
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
@config.command(name="show")
def config_show():
    """Show current config."""
    import json

    # Pretty-print the merged configuration (defaults plus saved values).
    rendered = json.dumps(load_config(), indent=2)
    click.echo(rendered)
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
# Run the click group when this file is executed as a script.
if __name__ == "__main__":
    cli()
|
dug/chunker.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""Tree-sitter based function/method extractor — produces chunks for embedding."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from tree_sitter import Language, Parser, Node
|
|
10
|
+
|
|
11
|
+
import tree_sitter_python as tspython
|
|
12
|
+
import tree_sitter_java as tsjava
|
|
13
|
+
import tree_sitter_javascript as tsjavascript
|
|
14
|
+
import tree_sitter_typescript as tstypescript
|
|
15
|
+
|
|
16
|
+
# ---------------------------------------------------------------------------
|
|
17
|
+
# Language parsers
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
|
|
20
|
+
# Tree-sitter Language objects keyed by dug's language name.
# TypeScript and TSX use separate grammars from the same package.
_LANGUAGES: dict[str, Language] = {
    "python": Language(tspython.language()),
    "java": Language(tsjava.language()),
    "javascript": Language(tsjavascript.language()),
    "typescript": Language(tstypescript.language_typescript()),
    "tsx": Language(tstypescript.language_tsx()),
}

# File suffix -> key into _LANGUAGES / _FUNCTION_NODE_TYPES.
_EXT_TO_LANG: dict[str, str] = {
    ".py": "python",
    ".java": "java",
    ".js": "javascript",
    ".jsx": "javascript",
    ".ts": "typescript",
    ".tsx": "tsx",
}

# Node types that represent callable units worth embedding
_FUNCTION_NODE_TYPES: dict[str, set[str]] = {
    "python": {"function_definition", "decorated_definition"},
    "java": {"method_declaration", "constructor_declaration"},
    "javascript": {"function_declaration", "method_definition", "arrow_function",
                   "function_expression"},
    "typescript": {"function_declaration", "method_definition", "arrow_function",
                   "function_expression", "method_signature"},
    "tsx": {"function_declaration", "method_definition", "arrow_function",
            "function_expression", "method_signature"},
}

# Inclusive bounds on a chunk's decoded source length, in characters;
# _walk drops anything outside this range.
MIN_CHUNK_CHARS = 30
MAX_CHUNK_CHARS = 8000
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# ---------------------------------------------------------------------------
|
|
54
|
+
# Chunk dataclass
|
|
55
|
+
# ---------------------------------------------------------------------------
|
|
56
|
+
|
|
57
|
+
@dataclass
class Chunk:
    """One extracted function/method, as produced by ``_walk``."""

    chunk_id: str       # md5(file_path + function_name + str(start_line))
    file_path: str      # relative to repo root
    function_name: str  # name resolved by _get_function_name
    start_line: int     # 1-indexed
    end_line: int       # 1-indexed, line of the node's end point
    code: str           # source text of the node, decoded with errors="replace"
    language: str       # key from _EXT_TO_LANG, e.g. "python" or "tsx"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _make_chunk_id(file_path: str, name: str, start_line: int) -> str:
    """Return the MD5 hex digest of ``"<file_path>:<name>:<start_line>"``."""
    digest = hashlib.md5()
    digest.update("{}:{}:{}".format(file_path, name, start_line).encode())
    return digest.hexdigest()
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
# AST walker
|
|
75
|
+
# ---------------------------------------------------------------------------
|
|
76
|
+
|
|
77
|
+
def _get_function_name(node: Node, code_bytes: bytes, language: str) -> str:
    """Extract the best available name for a function/method node.

    Resolution order:
      1. For a Python ``decorated_definition``, the wrapped function's name.
      2. The node's own ``name`` field.
      3. For anonymous arrow/function expressions, the name they are bound
         to: ``const f = () => ...``, ``{ f: function () {} }`` or
         ``obj.f = () => ...``.

    Args:
        node: Tree-sitter node of a function-like type.
        code_bytes: Raw bytes of the file the node was parsed from.
        language: Language key; currently unused, kept for the call signature.

    Returns:
        The resolved name, or ``"<anonymous>"`` when none can be determined.
    """
    def _text(n: Node) -> str:
        return code_bytes[n.start_byte:n.end_byte].decode(errors="replace")

    # decorated_definition wraps the actual function — recurse one level
    if node.type == "decorated_definition":
        for child in node.children:
            if child.type == "function_definition":
                return _get_function_name(child, code_bytes, language)

    name_node = node.child_by_field_name("name")
    if name_node:
        return _text(name_node)

    # Arrow functions and function expressions usually carry no name of
    # their own; use the binding they are assigned to. Maps the parent node
    # type to the field on that parent which holds the bound name.
    if node.type in ("arrow_function", "function_expression"):
        binding_field = {
            "variable_declarator": "name",
            "pair": "key",
            "assignment_expression": "left",
        }
        parent = node.parent
        if parent is not None and parent.type in binding_field:
            bound = parent.child_by_field_name(binding_field[parent.type])
            if bound is not None:
                bound_name = _text(bound).strip()
                # Skip destructuring patterns and other multi-line targets.
                if bound_name and "\n" not in bound_name:
                    return bound_name

    return "<anonymous>"
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _walk(node: Node, code_bytes: bytes, language: str,
          target_types: set[str], results: list[Chunk], file_path: str) -> None:
    """Depth-first walk that appends a Chunk for every function-like node.

    A node is emitted when its type is in ``target_types``, it has a
    resolvable name, and its source length is within
    ``MIN_CHUNK_CHARS..MAX_CHUNK_CHARS``. Children are always visited, so
    nested functions and methods are extracted too.

    Args:
        node: Current tree-sitter node.
        code_bytes: Raw bytes of the parsed file.
        language: Language key stored on each Chunk.
        target_types: Node types to emit for this language.
        results: Output list, appended to in pre-order.
        file_path: Path recorded on each Chunk.
    """
    start_line = node.start_point[0] + 1
    end_line = node.end_point[0] + 1

    # A decorated Python function shows up twice: as the decorated_definition
    # wrapper and as the function_definition inside it. If the wrapper was
    # just emitted (it is the latest result and spans the parent's lines),
    # skip the inner node so the same function is not chunked twice.
    parent = node.parent
    duplicate_of_wrapper = (
        node.type == "function_definition"
        and parent is not None
        and parent.type == "decorated_definition"
        and bool(results)
        and results[-1].start_line == parent.start_point[0] + 1
        and results[-1].end_line == parent.end_point[0] + 1
    )

    if node.type in target_types and not duplicate_of_wrapper:
        name = _get_function_name(node, code_bytes, language)
        code = code_bytes[node.start_byte:node.end_byte].decode(errors="replace")
        if MIN_CHUNK_CHARS <= len(code) <= MAX_CHUNK_CHARS and name != "<anonymous>":
            results.append(Chunk(
                chunk_id=_make_chunk_id(file_path, name, start_line),
                file_path=file_path,
                function_name=name,
                start_line=start_line,
                end_line=end_line,
                code=code,
                language=language,
            ))
    # still recurse — nested functions/methods should also be extracted
    for child in node.children:
        _walk(child, code_bytes, language, target_types, results, file_path)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# ---------------------------------------------------------------------------
|
|
114
|
+
# Public API
|
|
115
|
+
# ---------------------------------------------------------------------------
|
|
116
|
+
|
|
117
|
+
def extract_chunks(file_path: Path, root: Path) -> list[Chunk]:
    """Return one Chunk per function/method found in ``file_path``.

    The language is chosen from the file suffix. Unsupported suffixes and
    unreadable files yield an empty list. Chunk paths are recorded relative
    to ``root``.
    """
    lang = _EXT_TO_LANG.get(file_path.suffix)
    grammar = None if lang is None else _LANGUAGES.get(lang)
    if grammar is None:
        return []

    parser = Parser(grammar)

    try:
        source = file_path.read_bytes()
    except OSError:
        return []

    tree = parser.parse(source)
    relative_path = str(file_path.relative_to(root))
    wanted_types = _FUNCTION_NODE_TYPES.get(lang, set())

    chunks: list[Chunk] = []
    _walk(tree.root_node, source, lang, wanted_types, chunks, relative_path)
    return chunks
|
dug/config.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import subprocess
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
# Built-in configuration. load_config() overlays the contents of
# .dug/config.json on top of these values.
DEFAULTS = {
    # "openai" selects the OpenAI embedder; any other value uses the local one.
    "embedding_mode": "local",
    # Passed to the OpenAI embedder; None in local mode.
    "api_key": None,
    # Overwritten by `dug init` with the languages it detects.
    "languages": ["python", "java", "typescript", "javascript"],
    # Shown by `dug init`; presumably directory names the indexer skips — confirm in indexer.py.
    "ignore_paths": ["node_modules", ".git", "build", "dist", "vendor", "__pycache__", ".venv", "venv", ".tox", "eggs", ".eggs"],
    # Number of commits requested from `git log` for a query.
    "git_history_depth": 50,
    # Passed as top_k to the hybrid search.
    "max_files_in_prompt": 5,
    # NOTE(review): not read anywhere in the visible modules — confirm where it is used.
    "exclude_test_files": True,
}
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def find_repo_root() -> Path:
    """Locate the project root for the current working directory.

    Resolution order:
      1. The path printed by ``git rev-parse --show-toplevel``, when git is
         installed and the command succeeds.
      2. The nearest directory, starting at cwd and walking up, that already
         contains ``.dug`` (non-git projects that ran ``dug init``).
      3. The current working directory.
    """
    cwd = Path.cwd()

    try:
        proc = subprocess.run(
            ["git", "rev-parse", "--show-toplevel"],
            capture_output=True,
            text=True,
            cwd=cwd,
        )
    except FileNotFoundError:
        # git executable not available; fall through to the .dug lookup.
        proc = None

    if proc is not None and proc.returncode == 0:
        return Path(proc.stdout.strip())

    # Fallback: walk up looking for an existing .dug/ directory
    for candidate in (cwd, *cwd.parents):
        if (candidate / ".dug").exists():
            return candidate

    return cwd
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def get_dug_dir() -> Path:
    """Return the ``.dug`` directory under the repository root."""
    repo_root = find_repo_root()
    return repo_root.joinpath(".dug")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def get_config_path() -> Path:
    """Return the path of ``config.json`` inside the ``.dug`` directory."""
    dug_dir = get_dug_dir()
    return dug_dir.joinpath("config.json")
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def load_config() -> dict:
    """Return the effective configuration.

    Starts from a deep copy of ``DEFAULTS`` and overlays the values stored
    in ``config.json`` when that file exists. The deep copy keeps the
    module-level defaults (which contain lists) from being mutated through a
    returned config.

    Returns:
        A new dict owned by the caller.

    Raises:
        json.JSONDecodeError: If the config file is not valid JSON.
    """
    from copy import deepcopy

    cfg = deepcopy(DEFAULTS)
    path = get_config_path()
    if not path.exists():
        return cfg
    # JSON text is UTF-8 by specification; don't depend on the locale.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    cfg.update(data)
    return cfg
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def save_config(cfg: dict) -> None:
    """Write ``cfg`` as indented JSON to the config file, creating ``.dug`` if needed."""
    target = get_config_path()
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w") as handle:
        json.dump(cfg, handle, indent=2)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def set_config_value(key: str, value: str) -> None:
    """Store one setting given as command-line text.

    Coercion rules, applied in order:
      * ``"null"`` (any case) -> ``None``
      * ``"true"`` / ``"false"`` (any case) -> ``bool``
      * keys whose default is an integer -> ``int``
      * keys whose default is a list -> comma-separated list of strings
      * anything else is stored as the string given

    Args:
        key: Config key, already in underscore form.
        value: Raw text from the command line.

    Raises:
        ValueError: If an integer setting is given a non-integer value.
    """
    cfg = load_config()
    lowered = value.lower()
    default = DEFAULTS.get(key)

    if lowered == "null":
        coerced = None
    elif lowered in ("true", "false"):
        coerced = lowered == "true"
    elif isinstance(default, int) and not isinstance(default, bool):
        # e.g. max_files_in_prompt / git_history_depth: storing "10" as a
        # string would break consumers that expect a number.
        try:
            coerced = int(value)
        except ValueError:
            raise ValueError(f"{key} expects an integer, got {value!r}") from None
    elif isinstance(default, list):
        coerced = [item.strip() for item in value.split(",") if item.strip()]
    else:
        coerced = value

    cfg[key] = coerced
    save_config(cfg)
|
|
77
|
+
# phase 4 test comment
|
dug/embeddings.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import subprocess
|
|
6
|
+
import sys
|
|
7
|
+
import warnings
|
|
8
|
+
|
|
9
|
+
# Suppress HuggingFace noise before any library import
# setdefault() only fills in a variable that is not already set, so values
# exported by the user take precedence.
os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
os.environ.setdefault("TRANSFORMERS_NO_ADVISORY_WARNINGS", "1")
os.environ.setdefault("HF_HUB_VERBOSITY", "error")
warnings.filterwarnings("ignore", category=UserWarning, module="huggingface_hub")
warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")

# Raise these third-party loggers to ERROR so only errors are emitted.
for _noisy in ("sentence_transformers", "huggingface_hub", "transformers",
               "torch", "tokenizers"):
    logging.getLogger(_noisy).setLevel(logging.ERROR)
|
|
20
|
+
|
|
21
|
+
# ---------------------------------------------------------------------------
|
|
22
|
+
# Dependency installer
|
|
23
|
+
# ---------------------------------------------------------------------------
|
|
24
|
+
|
|
25
|
+
# pip requirement names handed to _ensure_installed() by each embedder.
_LOCAL_DEPS = ["sentence-transformers"]
_OPENAI_DEPS = ["openai"]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _ensure_installed(packages: list[str], label: str) -> None:
    """Check if packages are importable; pip-install them if not.

    Args:
        packages: pip requirement names. The import name is derived by
            replacing ``-`` with ``_`` and dropping any ``[extra]`` suffix.
        label: Human-readable name used in progress messages.

    Exits the process with status 1 when the pip install fails.
    """
    import importlib

    missing = []
    for pkg in packages:
        module = pkg.replace("-", "_").split("[")[0]
        try:
            importlib.import_module(module)
        except ImportError:
            missing.append(pkg)

    if not missing:
        return

    print(f"\n[dug] {label} dependencies not found: {', '.join(missing)}")
    print("[dug] Installing (one-time download)...\n")

    try:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "--quiet", *missing],
            stdout=sys.stdout,
            stderr=sys.stderr,
        )
    except subprocess.CalledProcessError:
        print("\n[dug] ✗ Auto-install failed. Run manually:")
        print(f" pip install {' '.join(missing)}")
        sys.exit(1)

    # The packages appeared on disk after the interpreter started; drop the
    # import system's cached directory listings so the caller's import that
    # follows can find them.
    importlib.invalidate_caches()
    print(f"\n[dug] ✓ {label} dependencies installed.\n")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# ---------------------------------------------------------------------------
|
|
60
|
+
# Embedders
|
|
61
|
+
# ---------------------------------------------------------------------------
|
|
62
|
+
|
|
63
|
+
class LocalEmbedder:
    """Embedder backed by the ``all-MiniLM-L6-v2`` sentence-transformers model."""

    def __init__(self):
        # Install sentence-transformers on first use, then load the model.
        _ensure_installed(_LOCAL_DEPS, "Local embedding")
        from sentence_transformers import SentenceTransformer

        self.model = SentenceTransformer("all-MiniLM-L6-v2")

    def embed(self, text: str) -> list[float]:
        """Encode ``text`` and return the vector as a plain list."""
        vector = self.model.encode(text)
        return vector.tolist()
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class OpenAIEmbedder:
    """Embedder backed by OpenAI's ``text-embedding-3-small`` model."""

    def __init__(self, api_key: str):
        # Install the openai client on first use, then create the client.
        _ensure_installed(_OPENAI_DEPS, "OpenAI")
        from openai import OpenAI

        self.client = OpenAI(api_key=api_key)

    def embed(self, text: str) -> list[float]:
        """Request an embedding for ``text`` and return the first result's vector."""
        request = {"model": "text-embedding-3-small", "input": text}
        reply = self.client.embeddings.create(**request)
        return reply.data[0].embedding
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# One embedder instance per embedding mode, reused for the process lifetime.
_cache: dict = {}


def get_embedder(config: dict) -> LocalEmbedder | OpenAIEmbedder:
    """Return the embedder for ``config["embedding_mode"]`` (default "local").

    The instance is created on first request and cached by mode. ``"openai"``
    builds an OpenAIEmbedder from ``config["api_key"]``; any other mode builds
    a LocalEmbedder.
    """
    mode = config.get("embedding_mode", "local")
    try:
        return _cache[mode]
    except KeyError:
        pass

    if mode == "openai":
        embedder = OpenAIEmbedder(api_key=config["api_key"])
    else:
        embedder = LocalEmbedder()
    _cache[mode] = embedder
    return embedder
|
dug/git_context.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class Commit:
    """A single entry parsed from ``git log --name-only``."""

    hash: str
    message: str
    timestamp: datetime
    files_touched: list[str] = field(default_factory=list)

    @property
    def days_ago(self) -> int:
        """Whole days elapsed between the commit timestamp and now (UTC)."""
        now_utc = datetime.now(timezone.utc)
        committed_utc = self.timestamp.astimezone(timezone.utc)
        elapsed = now_utc - committed_utc
        return elapsed.days
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def get_git_history(root: Path, depth: int = 50) -> list[Commit]:
    """Return the most recent commits of the repository at ``root``.

    Runs ``git log --name-only`` and parses each commit's hash, author date,
    subject and touched files.

    Args:
        root: Directory to run git in.
        depth: Maximum number of commits to request.

    Returns:
        Commits in ``git log`` order. Empty when git is not installed,
        ``root`` does not exist, or the command fails.
    """
    try:
        result = subprocess.run(
            # The subject goes last so a "|" inside it cannot shift the other
            # fields: split("|", 2) leaves it intact in the final part.
            ["git", "log", "--name-only", "--format=%H|%aI|%s", f"-n{depth}"],
            capture_output=True,
            text=True,
            # Decode as UTF-8 regardless of locale; never fail on odd bytes.
            encoding="utf-8",
            errors="replace",
            cwd=root,
        )
    except FileNotFoundError:
        return []

    if result.returncode != 0:
        return []

    hex_digits = set("0123456789abcdef")

    def _is_commit_hash(token: str) -> bool:
        # Full object names are 40 (SHA-1) or 64 (SHA-256) lowercase hex chars.
        return len(token) in (40, 64) and set(token) <= hex_digits

    commits: list[Commit] = []
    current: Commit | None = None

    for raw_line in result.stdout.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        parts = line.split("|", 2)
        # Only a line starting with a real hash is a commit header; a file
        # path that contains pipes is not mistaken for one.
        if len(parts) == 3 and _is_commit_hash(parts[0]):
            try:
                ts = datetime.fromisoformat(parts[1])
            except ValueError:
                ts = datetime.now(timezone.utc)
            if current:
                commits.append(current)
            current = Commit(hash=parts[0], message=parts[2], timestamp=ts)
        elif current is not None:
            current.files_touched.append(line)

    if current:
        commits.append(current)

    return commits
|