askmycode 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- askmycode/__init__.py +1 -0
- askmycode/cli.py +324 -0
- askmycode/config.py +56 -0
- askmycode/embedder.py +26 -0
- askmycode/indexer.py +180 -0
- askmycode/llm.py +177 -0
- askmycode/manifest.py +34 -0
- askmycode/project_config.py +33 -0
- askmycode/reranker.py +28 -0
- askmycode/retriever.py +35 -0
- askmycode/store.py +54 -0
- askmycode-0.1.0.dist-info/METADATA +287 -0
- askmycode-0.1.0.dist-info/RECORD +15 -0
- askmycode-0.1.0.dist-info/WHEEL +4 -0
- askmycode-0.1.0.dist-info/entry_points.txt +2 -0
askmycode/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
askmycode/cli.py
ADDED
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import typer
|
|
5
|
+
from google.genai.errors import ClientError, ServerError
|
|
6
|
+
from rich.console import Console
|
|
7
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn
|
|
8
|
+
|
|
9
|
+
from google.genai import types as genai_types
|
|
10
|
+
|
|
11
|
+
from askmycode import config, indexer, llm, manifest, project_config, retriever, store
|
|
12
|
+
|
|
13
|
+
app = typer.Typer(
|
|
14
|
+
name="askmycode",
|
|
15
|
+
help="Ask questions about any codebase using AI — your code stays private.",
|
|
16
|
+
add_completion=False,
|
|
17
|
+
no_args_is_help=True,
|
|
18
|
+
)
|
|
19
|
+
console = Console()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@app.command()
def index(
    path: Path = typer.Argument(Path("."), help="Path to the codebase to index"),
) -> None:
    """Scan and index a codebase so you can ask questions about it."""
    path = path.resolve()
    if not path.exists():
        console.print(f"[red]Path does not exist:[/red] {path}")
        raise typer.Exit(1)

    # Transient spinner whose description is driven by the indexer's
    # progress messages.
    spinner = Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console,
        transient=True,
    )
    with spinner:
        task_id = spinner.add_task("Starting...", total=None)
        n_files, n_chunks = indexer.index_directory(
            path,
            progress_callback=lambda msg: spinner.update(task_id, description=msg),
        )

    if not n_files:
        console.print("[yellow]No supported files found.[/yellow]")
        raise typer.Exit(0)

    console.print(f"[green]Indexed {n_files} files → {n_chunks} chunks[/green] (.askmycode/)")
    console.print('[dim]Now run: askmycode ask "your question"[/dim]')
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@app.command()
def ask(
    question: str = typer.Argument(..., help="Question about your codebase"),
    project: Path = typer.Option(Path("."), "--project", "-p", help="Path to the indexed project"),
    n_results: int = typer.Option(8, "--results", "-n", help="Chunks to retrieve"),
    sources: bool = typer.Option(False, "--sources", "-s", help="Show source files used to answer"),
    provider: str = typer.Option("", "--provider", help="LLM provider: gemini, openai, anthropic, ollama"),
    model: str = typer.Option("", "--model", help="Model name override"),
) -> None:
    """Ask a question about your indexed codebase."""
    project = project.resolve()
    pcfg = project_config.load(project)

    # Precedence: CLI flag, then project config, then user config.
    provider = provider or pcfg["provider"] or config.get_provider()
    model = model or pcfg["model"] or config.get_model()
    # NOTE(review): 8 is both the CLI default and the "not given" marker, so an
    # explicit `-n 8` is also replaced by the project-config value.
    n_results = n_results if n_results != 8 else pcfg["n_results"]

    api_key = config.get_api_key(provider)
    if not api_key and provider != "ollama":
        console.print(f"[red]No API key found for provider '{provider}'.[/red]")
        console.print(f" [bold]export {provider.upper()}_API_KEY=your-key[/bold]")
        console.print(" [bold]askmycode config set-key YOUR_KEY[/bold]")
        raise typer.Exit(1)

    if not (project / ".askmycode").exists():
        console.print("[red]No index found.[/red] Run first:")
        console.print(f" [bold]askmycode index {project}[/bold]")
        raise typer.Exit(1)

    with console.status("Searching codebase..."):
        context, metadatas = retriever.retrieve(question, project, n_results=n_results)

    if not context:
        console.print("[yellow]No relevant code found for that question.[/yellow]")
        raise typer.Exit(0)

    file_tree = llm.build_file_tree(list(manifest.load(project).keys()))

    console.print()
    try:
        for chunk in llm.answer_stream(
            question, context, api_key or "", file_tree=file_tree,
            provider=provider, model=model,
        ):
            # markup=False: the answer is arbitrary model text (often code).
            # With markup on, Rich would treat "[name]" as a style tag and
            # raise on an unmatched "[/name]".
            console.print(chunk, end="", markup=False)
        console.print("\n")
    except ClientError as e:
        if "429" in str(e) or "RESOURCE_EXHAUSTED" in str(e):
            console.print("\n[red]Quota exceeded.[/red] Your API key has no free tier quota.")
            console.print("Get a fresh key at [bold]aistudio.google.com[/bold] → Get API key → Create API key in new project")
            console.print("Then run: [bold]askmycode config set-key YOUR_NEW_KEY[/bold]")
        elif "401" in str(e) or "API_KEY_INVALID" in str(e):
            console.print("\n[red]Invalid API key.[/red] Run: [bold]askmycode config set-key YOUR_KEY[/bold]")
        else:
            console.print(f"\n[red]API error:[/red] {e}")
        raise typer.Exit(1)
    except ServerError as e:
        console.print(f"\n[red]Gemini server error:[/red] {e}")
        raise typer.Exit(1)
    except Exception as e:
        # CLI boundary: ClientError/ServerError only cover google-genai.
        # Failures from the other providers (or an unknown provider name)
        # are reported here instead of surfacing as a traceback.
        console.print(f"\n[red]{provider} error:[/red] {e}")
        raise typer.Exit(1)

    if sources and metadatas:
        seen: set = set()
        console.print("[dim]─── Sources ────────────────────────────[/dim]")
        for m in metadatas:
            # De-duplicate on (file, start line).
            key = (m.get("file_path", ""), m.get("start_line", ""))
            if key in seen:
                continue
            seen.add(key)
            console.print(f"[dim] {m.get('file_path')}:{m.get('start_line')}–{m.get('end_line')}[/dim]")
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
@app.command()
def init(
    path: Path = typer.Argument(Path("."), help="Project root to initialise"),
) -> None:
    """Set up askmycode for a project: update .gitignore and suggest first steps."""
    path = path.resolve()
    # Fail with a message instead of an OSError from the .gitignore write below.
    if not path.is_dir():
        console.print(f"[red]Not a directory:[/red] {path}")
        raise typer.Exit(1)

    # Detect project type from marker files; the first matching entry wins.
    markers: dict[str, list[str]] = {
        "Python": ["pyproject.toml", "setup.py", "requirements.txt"],
        "Node/TS": ["package.json"],
        "Go": ["go.mod"],
        "Rust": ["Cargo.toml"],
        "Java/Kotlin": ["pom.xml", "build.gradle", "build.gradle.kts"],
        "Ruby": ["Gemfile"],
    }
    detected = next(
        (lang for lang, files in markers.items() if any((path / f).exists() for f in files)),
        "Unknown",
    )

    # Ensure .askmycode/ is in .gitignore.
    gitignore = path / ".gitignore"
    entry = ".askmycode/"
    if gitignore.exists():
        content = gitignore.read_text()
        # Compare whole lines: a substring test would also accept a comment
        # or a negated pattern ("!.askmycode/") that merely mentions the entry.
        existing = {line.strip() for line in content.splitlines()}
        equivalents = {".askmycode", ".askmycode/", "/.askmycode", "/.askmycode/"}
        if existing.isdisjoint(equivalents):
            kept = content.rstrip()
            # An empty file gets just the entry, with no leading blank line.
            gitignore.write_text(f"{kept}\n{entry}\n" if kept else f"{entry}\n")
            console.print(f"[green]Added {entry} to .gitignore[/green]")
        else:
            console.print(f"[dim]{entry} already in .gitignore[/dim]")
    else:
        gitignore.write_text(f"{entry}\n")
        console.print(f"[green]Created .gitignore with {entry}[/green]")

    console.print(f"\n[bold]Detected project type:[/bold] {detected}")

    # Suggested starter questions per language.
    suggestions: dict[str, list[str]] = {
        "Python": ["how is the project structured?", "where is the entry point?", "how are dependencies managed?"],
        "Node/TS": ["what does the main script do?", "how is routing handled?", "what testing framework is used?"],
        "Go": ["what is the main package?", "how is error handling done?", "where are the HTTP handlers?"],
        "Rust": ["what does the main function do?", "how are errors propagated?", "what external crates are used?"],
        "Java/Kotlin": ["what is the application entry point?", "how is the project layered?", "where are the models defined?"],
        "Ruby": ["what does the application do?", "how is routing set up?", "where is business logic?"],
        "Unknown": ["how is the project structured?", "what does this codebase do?", "where is the main entry point?"],
    }
    console.print("\n[bold]Suggested first questions:[/bold]")
    for q in suggestions.get(detected, suggestions["Unknown"]):
        console.print(f' askmycode ask "{q}"')

    console.print("\n[bold]Next steps:[/bold]")
    console.print(f" 1. askmycode index {path}")
    console.print(' 2. askmycode ask "how is the project structured?"')
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
@app.command()
def chat(
    project: Path = typer.Option(Path("."), "--project", "-p", help="Path to the indexed project"),
    n_results: int = typer.Option(8, "--results", "-n", help="Chunks to retrieve per turn"),
    provider: str = typer.Option("", "--provider", help="LLM provider: gemini, openai, anthropic, ollama"),
    model: str = typer.Option("", "--model", help="Model name override"),
) -> None:
    """Start an interactive multi-turn chat session about your codebase."""
    project = project.resolve()
    pcfg = project_config.load(project)

    # Precedence: CLI flag, then project config, then user config.
    provider = provider or pcfg["provider"] or config.get_provider()
    model = model or pcfg["model"] or config.get_model()
    # NOTE(review): 8 is both the CLI default and the "not given" marker, so an
    # explicit `-n 8` is also replaced by the project-config value.
    n_results = n_results if n_results != 8 else pcfg["n_results"]

    api_key = config.get_api_key(provider)
    if not api_key and provider != "ollama":
        console.print(f"[red]No API key found for provider '{provider}'.[/red] Run: [bold]askmycode config set-key YOUR_KEY[/bold]")
        raise typer.Exit(1)

    if not (project / ".askmycode").exists():
        console.print(f"[red]No index found.[/red] Run: [bold]askmycode index {project}[/bold]")
        raise typer.Exit(1)

    console.print("[bold]askmycode chat[/bold] — type your question, [dim]exit[/dim] or Ctrl+C to quit.\n")

    history: list[genai_types.Content] = []
    file_tree = llm.build_file_tree(list(manifest.load(project).keys()))

    while True:
        try:
            question = console.input("[bold cyan]You:[/bold cyan] ").strip()
        except (KeyboardInterrupt, EOFError):
            console.print("\n[dim]Bye.[/dim]")
            break

        if not question:
            continue
        if question.lower() in ("exit", "quit", "q"):
            console.print("[dim]Bye.[/dim]")
            break

        with console.status("Searching codebase..."):
            context, _ = retriever.retrieve(question, project, n_results=n_results)

        if not context:
            console.print("[yellow]No relevant code found for that question.[/yellow]\n")
            continue

        console.print("[bold green]Assistant:[/bold green] ", end="")
        answer_parts: list[str] = []
        try:
            for chunk in llm.answer_stream(
                question, context, api_key or "", history=history,
                file_tree=file_tree, provider=provider, model=model,
            ):
                # markup=False: the answer is arbitrary model text (often
                # code). With markup on, Rich would treat "[name]" as a style
                # tag and raise on an unmatched "[/name]".
                console.print(chunk, end="", markup=False)
                answer_parts.append(chunk)
            console.print("\n")
        except Exception as e:
            # CLI boundary: report the failed turn and keep the session alive.
            # ClientError/ServerError only cover google-genai; catching
            # Exception also covers the other providers' failures.
            console.print(f"\n[red]Error:[/red] {e}\n")
            continue

        # Append this turn to history for follow-up context.
        user_turn = f"Context:\n{context}\n\nQuestion: {question}"
        history.append(llm._make_turn("user", user_turn))
        history.append(llm._make_turn("model", "".join(answer_parts)))
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
@app.command()
def stats(
    project: Path = typer.Option(Path("."), "--project", "-p", help="Path to the indexed project"),
) -> None:
    """Show index statistics for a project."""
    import datetime

    project = project.resolve()
    index_path = project / ".askmycode"
    if not index_path.exists():
        console.print(f"[red]No index found.[/red] Run: [bold]askmycode index {project}[/bold]")
        raise typer.Exit(1)

    mf = manifest.load(project)
    n_chunks = store.get_collection(store.get_client(project)).count()

    # NOTE(review): this is the newest "mtime" recorded in the manifest, shown
    # under the "Last indexed" label.
    newest_mtime = max((entry["mtime"] for entry in mf.values()), default=None)
    if newest_mtime:
        stamp = datetime.datetime.fromtimestamp(newest_mtime)
        last_indexed = stamp.strftime("%Y-%m-%d %H:%M:%S")
    else:
        last_indexed = "unknown"

    report = (
        f"[bold]Files indexed:[/bold] {len(mf)}",
        f"[bold]Total chunks:[/bold] {n_chunks}",
        f"[bold]Last indexed:[/bold] {last_indexed}",
        f"[bold]Index location:[/bold] {index_path}",
    )
    for row in report:
        console.print(row)
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
_config_app = typer.Typer(help="Manage configuration.")
|
|
279
|
+
app.add_typer(_config_app, name="config")
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
@_config_app.command("set-key")
def config_set_key(api_key: str = typer.Argument(..., help="Your API key")) -> None:
    """Save your API key to ~/.config/askmycode/config.json."""
    # Storage is delegated to the config module; this command only confirms.
    config.set_api_key(api_key)
    console.print("[green]API key saved.[/green]")
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
@_config_app.command("set-provider")
def config_set_provider(
    provider: str = typer.Argument(..., help="Provider: gemini, openai, anthropic, ollama"),
) -> None:
    """Set the default LLM provider."""
    valid = {"gemini", "openai", "anthropic", "ollama"}
    # Normalise before validating: provider names are compared in lowercase
    # elsewhere, so "Gemini" or " openai " should be accepted and stored in
    # canonical form.
    provider = provider.strip().lower()
    if provider not in valid:
        console.print(f"[red]Unknown provider.[/red] Choose: {', '.join(sorted(valid))}")
        raise typer.Exit(1)
    config.set_provider(provider)
    console.print(f"[green]Provider set to '{provider}'.[/green]")
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
@_config_app.command("set-model")
def config_set_model(
    model: str = typer.Argument(..., help="Model name, e.g. gpt-4o, llama3, claude-opus-4-7"),
) -> None:
    """Set the default model name."""
    # The name is stored as given; no validation happens here.
    config.set_model(model)
    confirmation = f"[green]Model set to '{model}'.[/green]"
    console.print(confirmation)
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
@_config_app.command("show")
def config_show() -> None:
    """Show current configuration."""
    provider = config.get_provider()
    console.print(f"[bold]Provider:[/bold] {provider}")
    console.print(f"[bold]Model:[/bold] {config.get_model() or '(default for provider)'}")

    key = config.get_api_key(provider)
    if not key:
        masked = "(not set)"
    elif len(key) > 12:
        # Long keys: show only the first 8 and last 4 characters.
        masked = key[:8] + "..." + key[-4:]
    else:
        # Short keys used to be printed verbatim; never echo a secret in full.
        masked = "*" * len(key)
    console.print(f"[bold]API key:[/bold] {masked}")
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
if __name__ == "__main__":
|
|
324
|
+
app()
|
askmycode/config.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
_CONFIG_DIR = Path.home() / ".config" / "askmycode"
|
|
6
|
+
_CONFIG_FILE = _CONFIG_DIR / "config.json"
|
|
7
|
+
|
|
8
|
+
_ENV_KEYS = {
|
|
9
|
+
"gemini": "GEMINI_API_KEY",
|
|
10
|
+
"openai": "OPENAI_API_KEY",
|
|
11
|
+
"anthropic": "ANTHROPIC_API_KEY",
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _load() -> dict:
    """Return the parsed contents of the config file, or {} when it is absent."""
    if not _CONFIG_FILE.exists():
        return {}
    raw = _CONFIG_FILE.read_text()
    return json.loads(raw)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _save(data: dict) -> None:
    """Write *data* as indented JSON to the config file, readable by the owner only.

    The file can hold an API key, so it is created with mode 0o600 rather
    than the process default. The config directory is created on demand.
    """
    _CONFIG_DIR.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(data, indent=2)
    # Create (or truncate) with restrictive permissions from the start so the
    # secret is never briefly world-readable.
    fd = os.open(_CONFIG_FILE, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w") as fh:
        fh.write(payload)
    # The mode passed to os.open only applies to new files; tighten a file
    # written by an earlier version as well.
    os.chmod(_CONFIG_FILE, 0o600)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_api_key(provider: str = "gemini") -> str | None:
    """Return the API key for *provider*: environment first, then the config file.

    The environment variable name comes from ``_ENV_KEYS`` when the provider
    is listed there, otherwise ``<PROVIDER>_API_KEY``. The fallback is the
    single ``api_key`` entry in the config file, or None when that is absent.
    """
    fallback_name = f"{provider.upper()}_API_KEY"
    from_env = os.environ.get(_ENV_KEYS.get(provider, fallback_name))
    if from_env:
        return from_env
    stored = _load()
    return stored.get("api_key")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def set_api_key(key: str) -> None:
    """Store *key* under ``api_key`` in the config file, keeping other settings."""
    _save({**_load(), "api_key": key})
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def get_provider() -> str:
    """Return the provider: ASKMYCODE_PROVIDER, else the config file, else "gemini"."""
    override = os.environ.get("ASKMYCODE_PROVIDER")
    if override:
        return override
    return _load().get("provider", "gemini")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def set_provider(provider: str) -> None:
    """Store *provider* under ``provider`` in the config file, keeping other settings."""
    _save({**_load(), "provider": provider})
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def get_model() -> str:
    """Return the model name: ASKMYCODE_MODEL, else the config file, else ""."""
    override = os.environ.get("ASKMYCODE_MODEL")
    if override:
        return override
    return _load().get("model", "")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def set_model(model: str) -> None:
    """Store *model* under ``model`` in the config file, keeping other settings."""
    _save({**_load(), "model": model})
|
askmycode/embedder.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import io
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
from functools import lru_cache
|
|
5
|
+
|
|
6
|
+
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
|
|
7
|
+
|
|
8
|
+
MODEL_NAME = "all-MiniLM-L6-v2"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@lru_cache(maxsize=1)
def _get_model():
    """Load the sentence-transformers model once and cache it.

    Both the import and the model construction run with ``sys.stderr``
    swapped for an in-memory buffer, so whatever they write there is
    discarded. The real stream is restored even if loading fails.
    """
    saved_stderr, sys.stderr = sys.stderr, io.StringIO()
    try:
        from sentence_transformers import SentenceTransformer

        model = SentenceTransformer(MODEL_NAME)
    finally:
        sys.stderr = saved_stderr
    return model
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def embed(texts: list[str]) -> list[list[float]]:
    """Encode *texts* with the cached model and return the vectors as nested lists."""
    vectors = _get_model().encode(texts, show_progress_bar=False)
    return vectors.tolist()
|
askmycode/indexer.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
import ast
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Callable, Optional
|
|
4
|
+
|
|
5
|
+
import pathspec
|
|
6
|
+
|
|
7
|
+
from askmycode import embedder, manifest, store
|
|
8
|
+
|
|
9
|
+
_CODE_EXTENSIONS = {
|
|
10
|
+
".py", ".js", ".ts", ".jsx", ".tsx", ".go", ".rs", ".java",
|
|
11
|
+
".c", ".cpp", ".h", ".hpp", ".rb", ".php", ".swift", ".kt",
|
|
12
|
+
".scala", ".sh", ".bash", ".yaml", ".yml", ".toml", ".json",
|
|
13
|
+
".md", ".sql", ".css", ".html", ".vue", ".svelte",
|
|
14
|
+
}
|
|
15
|
+
_MAX_FILE_BYTES = 1024 * 1024 # 1 MB
|
|
16
|
+
_CHUNK_LINES = 60
|
|
17
|
+
_OVERLAP_LINES = 15
|
|
18
|
+
_EMBED_BATCH = 64
|
|
19
|
+
|
|
20
|
+
_DEFAULT_IGNORES = [
|
|
21
|
+
".git", ".askmycode", "__pycache__", "node_modules",
|
|
22
|
+
".venv", "venv", "dist", "build", ".next", "coverage",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _load_gitignore(root: Path) -> pathspec.PathSpec:
    """Build the ignore spec: built-in defaults plus the root ``.gitignore`` lines."""
    gitignore = root / ".gitignore"
    project_patterns = gitignore.read_text().splitlines() if gitignore.exists() else []
    return pathspec.PathSpec.from_lines(
        "gitwildmatch",
        [*_DEFAULT_IGNORES, *project_patterns],
    )
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _scan_files(root: Path) -> list[Path]:
    """Return the sorted list of indexable files under *root*.

    A path qualifies when it is a regular file, has a supported extension
    (compared case-insensitively), is no larger than ``_MAX_FILE_BYTES`` and
    is not matched by the ignore spec.
    """
    spec = _load_gitignore(root)

    def wanted(candidate: Path) -> bool:
        # Same check order as before; `and` short-circuits, so stat() and the
        # ignore match only run for files that passed the earlier checks.
        return (
            candidate.is_file()
            and candidate.suffix.lower() in _CODE_EXTENSIONS
            and candidate.stat().st_size <= _MAX_FILE_BYTES
            and not spec.match_file(str(candidate.relative_to(root)))
        )

    return sorted(candidate for candidate in root.rglob("*") if wanted(candidate))
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _ast_chunks(content: str, lines: list[str], relative: str) -> list[dict] | None:
|
|
51
|
+
"""Split a Python file at top-level function/class boundaries using AST.
|
|
52
|
+
Returns None if parsing fails (falls back to sliding window)."""
|
|
53
|
+
try:
|
|
54
|
+
tree = ast.parse(content)
|
|
55
|
+
except SyntaxError:
|
|
56
|
+
return None
|
|
57
|
+
|
|
58
|
+
top_nodes = [
|
|
59
|
+
n for n in ast.walk(tree)
|
|
60
|
+
if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef))
|
|
61
|
+
and isinstance(getattr(n, "col_offset", -1), int) and n.col_offset == 0
|
|
62
|
+
]
|
|
63
|
+
if not top_nodes:
|
|
64
|
+
return None
|
|
65
|
+
|
|
66
|
+
# Sort by line number and build boundary ranges
|
|
67
|
+
top_nodes.sort(key=lambda n: n.lineno)
|
|
68
|
+
boundaries: list[tuple[int, int]] = []
|
|
69
|
+
for i, node in enumerate(top_nodes):
|
|
70
|
+
start = node.lineno - 1
|
|
71
|
+
end = (top_nodes[i + 1].lineno - 2) if i + 1 < len(top_nodes) else len(lines)
|
|
72
|
+
boundaries.append((start, end))
|
|
73
|
+
|
|
74
|
+
# Module-level code before the first definition
|
|
75
|
+
if boundaries and boundaries[0][0] > 0:
|
|
76
|
+
boundaries.insert(0, (0, boundaries[0][0]))
|
|
77
|
+
|
|
78
|
+
chunks = []
|
|
79
|
+
for start, end in boundaries:
|
|
80
|
+
body = "\n".join(lines[start:end]).strip()
|
|
81
|
+
if body:
|
|
82
|
+
chunks.append({
|
|
83
|
+
"id": f"{relative}:{start}",
|
|
84
|
+
"content": f"# {relative} (lines {start + 1}–{end})\n\n{body}",
|
|
85
|
+
"file_path": relative,
|
|
86
|
+
"start_line": start + 1,
|
|
87
|
+
"end_line": end,
|
|
88
|
+
})
|
|
89
|
+
return chunks or None
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _sliding_chunks(lines: list[str], relative: str) -> list[dict]:
|
|
93
|
+
chunks = []
|
|
94
|
+
i = 0
|
|
95
|
+
while i < len(lines):
|
|
96
|
+
end = min(i + _CHUNK_LINES, len(lines))
|
|
97
|
+
body = "\n".join(lines[i:end]).strip()
|
|
98
|
+
if body:
|
|
99
|
+
chunks.append({
|
|
100
|
+
"id": f"{relative}:{i}",
|
|
101
|
+
"content": f"# {relative} (lines {i + 1}–{end})\n\n{body}",
|
|
102
|
+
"file_path": relative,
|
|
103
|
+
"start_line": i + 1,
|
|
104
|
+
"end_line": end,
|
|
105
|
+
})
|
|
106
|
+
i += _CHUNK_LINES - _OVERLAP_LINES
|
|
107
|
+
return chunks
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _chunk_file(path: Path, root: Path) -> list[dict]:
    """Read *path* and split it into chunks; unreadable files yield no chunks.

    Files whose suffix is exactly ``.py`` are chunked by AST when possible;
    every other file, and any Python file the AST chunker gives up on, is
    chunked with the sliding window.
    """
    try:
        text = path.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        return []

    lines = text.splitlines()
    relative = str(path.relative_to(root))

    by_definition = _ast_chunks(text, lines, relative) if path.suffix == ".py" else None
    if by_definition is None:
        return _sliding_chunks(lines, relative)
    return by_definition
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def index_directory(
    root: Path,
    progress_callback: Optional[Callable[[str], None]] = None,
) -> tuple[int, int]:
    """Incrementally index the files under *root*.

    Compares the scanned files with the saved manifest, removes chunks of
    files that disappeared, re-chunks and re-embeds new or changed files, and
    saves the updated manifest.

    Args:
        root: Project root to index.
        progress_callback: Optional sink for human-readable progress messages.

    Returns:
        ``(number of indexable files found, number of chunks written this run)``.
    """
    def log(msg: str) -> None:
        if progress_callback:
            progress_callback(msg)

    log("Scanning files...")
    files = _scan_files(root)
    relatives = {p: str(p.relative_to(root)) for p in files}
    current_relatives = set(relatives.values())

    mf = manifest.load(root)
    client = store.get_client(root)
    collection = store.get_collection(client)

    # Remove chunks for files that were deleted since the last index.
    deleted = [rel for rel in mf if rel not in current_relatives]
    if deleted:
        log(f"Removing {len(deleted)} deleted file(s)...")
        store.delete_by_files(collection, deleted)
        for rel in deleted:
            del mf[rel]

    # Only process files that are new or changed.
    changed_files = [p for p in files if manifest.is_changed(p, mf, relatives[p])]
    skipped = len(files) - len(changed_files)

    if not changed_files:
        if deleted:
            # Persist the removals; otherwise the manifest on disk keeps
            # listing files whose chunks were just deleted.
            manifest.save(root, mf)
        log(f"Nothing changed — {len(files)} files already up to date.")
        return len(files), 0

    if skipped:
        log(f"Skipping {skipped} unchanged file(s)...")

    # Chunk ids embed the start offset, so an edited file can produce a
    # different id set. Drop its old chunks first, or ids that no longer
    # exist would linger in the store. (Presumes delete_by_files removes all
    # chunks of the given relative paths, as in the deletion step above.)
    stale = [relatives[p] for p in changed_files if relatives[p] in mf]
    if stale:
        store.delete_by_files(collection, stale)

    all_chunks: list[dict] = []
    for i, path in enumerate(changed_files):
        log(f"Reading ({i + 1}/{len(changed_files)}) {path.name}")
        all_chunks.extend(_chunk_file(path, root))

    if all_chunks:
        for i in range(0, len(all_chunks), _EMBED_BATCH):
            batch = all_chunks[i: i + _EMBED_BATCH]
            log(f"Embedding chunks {i + 1}–{min(i + _EMBED_BATCH, len(all_chunks))} / {len(all_chunks)}")
            embeddings = embedder.embed([c["content"] for c in batch])
            store.upsert_chunks(collection, batch, embeddings)

    # Update the manifest only after the chunks are stored, so an interrupted
    # run re-processes these files next time.
    for path in changed_files:
        mf[relatives[path]] = manifest.file_key(path)
    manifest.save(root, mf)

    return len(files), len(all_chunks)
|
askmycode/llm.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
from typing import Iterator
|
|
2
|
+
from google import genai
|
|
3
|
+
from google.genai import types
|
|
4
|
+
|
|
5
|
+
_SYSTEM = """\
|
|
6
|
+
You are a helpful assistant that answers questions about a software codebase.
|
|
7
|
+
You are given relevant code snippets retrieved from the codebase and a question.
|
|
8
|
+
- Answer based on the provided snippets.
|
|
9
|
+
- Reference specific file paths and line numbers when relevant.
|
|
10
|
+
- If the answer cannot be determined from the snippets, say so clearly.
|
|
11
|
+
- Keep answers concise and developer-focused.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
# ── Gemini (default) ──────────────────────────────────────────────────────────
|
|
15
|
+
|
|
16
|
+
_GEMINI_MODEL = "gemini-2.5-flash-lite"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _make_turn(role: str, text: str) -> types.Content:
    """Wrap *text* in a single-part ``Content`` object with the given role."""
    part = types.Part(text=text)
    return types.Content(role=role, parts=[part])
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def build_file_tree(file_paths: list[str]) -> str:
    """Render *file_paths* as a sorted, indented listing under a heading.

    Returns an empty string when there are no paths.
    """
    if not file_paths:
        return ""
    # Every entry gets the same prefix, so sorting the raw paths gives the
    # same order as sorting the prefixed strings.
    entries = [f" {name}" for name in sorted(file_paths)]
    return "\n".join(["Project structure:", *entries])
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _build_user_turn(question: str, context: str, file_tree: str) -> str:
    """Assemble the user message: optional file tree, then snippets, then the question."""
    sections = []
    if file_tree:
        sections.append(f"{file_tree}\n\n")
    sections.append("Relevant code snippets:\n\n")
    sections.append(f"{context}\n\n")
    sections.append(f"Question: {question}")
    return "".join(sections)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _stream_gemini(
    question: str,
    context: str,
    api_key: str,
    history: list[types.Content] | None,
    file_tree: str,
    model: str,
) -> Iterator[str]:
    """Stream answer text from Gemini, yielding each non-empty text fragment.

    Earlier turns in *history* are sent ahead of the new user turn, and the
    shared system prompt is passed as the system instruction.
    """
    client = genai.Client(api_key=api_key)
    new_turn = _make_turn("user", _build_user_turn(question, context, file_tree))
    conversation = [*(history or []), new_turn]
    response = client.models.generate_content_stream(
        model=model,
        contents=conversation,
        config=types.GenerateContentConfig(system_instruction=_SYSTEM),
    )
    for piece in response:
        fragment = piece.text
        if fragment:
            yield fragment
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# ── OpenAI ────────────────────────────────────────────────────────────────────
|
|
61
|
+
|
|
62
|
+
def _stream_openai(
    question: str,
    context: str,
    api_key: str,
    history: list[dict] | None,
    file_tree: str,
    model: str,
) -> Iterator[str]:
    """Stream answer text from the OpenAI chat completions API.

    Args:
        question: The user's question.
        context: Retrieved code snippets to answer from.
        api_key: OpenAI API key.
        history: Earlier turns as dicts with ``role`` and ``content`` keys.
        file_tree: Optional project listing placed ahead of the snippets.
        model: Model name.

    Yields:
        Each non-empty content delta, in order.
    """
    # Imported lazily, inside the function, as in the original.
    from openai import OpenAI

    client = OpenAI(api_key=api_key)
    user_turn = _build_user_turn(question, context, file_tree)
    messages = [{"role": "system", "content": _SYSTEM}]
    messages.extend({"role": h["role"], "content": h["content"]} for h in history or [])
    messages.append({"role": "user", "content": user_turn})

    with client.chat.completions.create(model=model, messages=messages, stream=True) as stream:
        for chunk in stream:
            # A chunk with an empty `choices` list would raise IndexError on
            # `choices[0]`; skip it.
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta.content
            if delta:
                yield delta
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# ── Anthropic / Claude ────────────────────────────────────────────────────────
|
|
85
|
+
|
|
86
|
+
def _stream_anthropic(
    question: str,
    context: str,
    api_key: str,
    history: list[dict] | None,
    file_tree: str,
    model: str,
) -> Iterator[str]:
    """Stream answer text from the Anthropic messages API.

    *history* entries are read as dicts with ``role`` and ``content`` keys and
    sent before the new user turn; the system prompt is passed separately.
    """
    import anthropic

    client = anthropic.Anthropic(api_key=api_key)
    new_turn = {"role": "user", "content": _build_user_turn(question, context, file_tree)}
    conversation = [
        {"role": earlier["role"], "content": earlier["content"]}
        for earlier in history or []
    ]
    conversation.append(new_turn)

    request = client.messages.stream(
        model=model,
        max_tokens=2048,
        system=_SYSTEM,
        messages=conversation,
    )
    with request as stream:
        yield from stream.text_stream
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
# ── Ollama (local) ────────────────────────────────────────────────────────────
|
|
112
|
+
|
|
113
|
+
def _stream_ollama(
    question: str,
    context: str,
    history: list[dict] | None,
    file_tree: str,
    model: str,
    base_url: str,
) -> Iterator[str]:
    """Stream answer text from an Ollama server's ``/api/chat`` endpoint.

    Args:
        question: The user's question.
        context: Retrieved code snippets to answer from.
        history: Earlier turns as dicts with ``role`` and ``content`` keys.
        file_tree: Optional project listing placed ahead of the snippets.
        model: Model name.
        base_url: Base URL of the Ollama server.

    Yields:
        Each non-empty ``message.content`` fragment from the response lines.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    import json

    import requests

    user_turn = _build_user_turn(question, context, file_tree)
    messages = [{"role": "system", "content": _SYSTEM}]
    messages.extend({"role": h["role"], "content": h["content"]} for h in history or [])
    messages.append({"role": "user", "content": user_turn})

    # The context manager closes the streamed response even when the caller
    # stops iterating early or a line fails to parse.
    with requests.post(
        f"{base_url}/api/chat",
        json={"model": model, "messages": messages, "stream": True},
        stream=True,
        timeout=120,
    ) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line:
                continue
            data = json.loads(line)
            text = data.get("message", {}).get("content", "")
            if text:
                yield text
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
# ── Public interface ──────────────────────────────────────────────────────────
|
|
143
|
+
|
|
144
|
+
def answer_stream(
    question: str,
    context: str,
    api_key: str,
    history=None,
    file_tree: str = "",
    provider: str = "gemini",
    model: str = "",
    ollama_url: str = "http://localhost:11434",
) -> Iterator[str]:
    """Yield answer fragments from whichever LLM backend ``provider`` names.

    An empty ``model`` selects the backend's built-in default. ``api_key`` is
    forwarded to the hosted backends; the Ollama backend receives
    ``ollama_url`` instead and never sees the key.

    Raises:
        ValueError: If ``provider`` is not one of gemini, openai, anthropic,
            ollama. Because this is a generator, the error surfaces when
            iteration starts, not at call time.
    """
    if provider == "ollama":
        # Local backend: addressed by URL, no API key involved.
        backend = _stream_ollama(
            question, context, history, file_tree,
            model=model or "llama3",
            base_url=ollama_url,
        )
    elif provider == "anthropic":
        backend = _stream_anthropic(
            question, context, api_key, history, file_tree,
            model=model or "claude-sonnet-4-6",
        )
    elif provider == "openai":
        backend = _stream_openai(
            question, context, api_key, history, file_tree,
            model=model or "gpt-4o-mini",
        )
    elif provider == "gemini":
        backend = _stream_gemini(
            question, context, api_key, history, file_tree,
            model=model or _GEMINI_MODEL,
        )
    else:
        raise ValueError(f"Unknown provider: {provider!r}. Choose: gemini, openai, anthropic, ollama")

    yield from backend
|
askmycode/manifest.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
# Manifest location, relative to the project root it is joined onto.
_MANIFEST_FILE = ".askmycode/manifest.json"
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _path(project_root: Path) -> Path:
    """Return where the manifest file lives inside ``project_root``."""
    manifest_path = project_root / _MANIFEST_FILE
    return manifest_path
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def load(project_root: Path) -> dict:
    """Load the manifest stored under ``project_root``.

    Returns:
        The parsed manifest dict, or an empty dict when there is no usable
        manifest: the file (or its directory) does not exist, its content is
        not valid JSON, or the JSON is not an object.
    """
    try:
        # Read directly instead of checking exists() first, so the file
        # cannot disappear between the check and the read.
        raw = _path(project_root).read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        return {}
    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        # A truncated or hand-edited manifest is treated as absent rather
        # than crashing the caller.
        return {}
    return data if isinstance(data, dict) else {}
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def save(project_root: Path, manifest: dict) -> None:
    """Write ``manifest`` as indented JSON to the project's manifest file.

    The containing directory is created first if it does not exist yet.
    """
    target = _path(project_root)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(manifest, indent=2)
    target.write_text(serialized)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def file_key(path: Path) -> dict:
    """Return the ``{"mtime", "size"}`` fingerprint of the file at ``path``.

    Both values are taken from a single ``stat()`` call.
    """
    info = path.stat()
    fingerprint = {"mtime": info.st_mtime, "size": info.st_size}
    return fingerprint
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def is_changed(path: Path, manifest: dict, relative: str) -> bool:
    """Tell whether the file at ``path`` differs from its manifest record.

    A file with no entry under ``relative`` counts as changed. Otherwise it
    is changed when its current mtime or size differs from the recorded
    ``"mtime"`` / ``"size"`` values.
    """
    if relative in manifest:
        recorded = manifest[relative]
        current = path.stat()
        unchanged = (
            current.st_mtime == recorded["mtime"]
            and current.st_size == recorded["size"]
        )
        return not unchanged
    return True
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Per-project configuration loaded from .askmycode.toml."""
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
try:
|
|
5
|
+
import tomllib
|
|
6
|
+
except ImportError:
|
|
7
|
+
try:
|
|
8
|
+
import tomli as tomllib # type: ignore[no-redef]
|
|
9
|
+
except ImportError:
|
|
10
|
+
tomllib = None # type: ignore[assignment]
|
|
11
|
+
|
|
12
|
+
_CONFIG_FILE = ".askmycode.toml"
|
|
13
|
+
|
|
14
|
+
_DEFAULTS: dict = {
|
|
15
|
+
"provider": "",
|
|
16
|
+
"model": "",
|
|
17
|
+
"n_results": 8,
|
|
18
|
+
"include": [],
|
|
19
|
+
"exclude": [],
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def load(project_root: Path) -> dict:
|
|
24
|
+
path = project_root / _CONFIG_FILE
|
|
25
|
+
if not path.exists() or tomllib is None:
|
|
26
|
+
return dict(_DEFAULTS)
|
|
27
|
+
try:
|
|
28
|
+
data = tomllib.loads(path.read_text())
|
|
29
|
+
cfg = dict(_DEFAULTS)
|
|
30
|
+
cfg.update(data.get("askmycode", {}))
|
|
31
|
+
return cfg
|
|
32
|
+
except Exception:
|
|
33
|
+
return dict(_DEFAULTS)
|
askmycode/reranker.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import io
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
from functools import lru_cache
|
|
5
|
+
|
|
6
|
+
# setdefault keeps any value the user already exported. Presumably this
# silences the Hugging Face tokenizers parallelism warning — TODO confirm.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
|
|
7
|
+
|
|
8
|
+
_RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"


@lru_cache(maxsize=1)
def _get_cross_encoder():
    """Build the cross-encoder for ``_RERANK_MODEL``, once per process.

    ``lru_cache`` means the model is constructed on first use and reused
    afterwards. Anything written to ``sys.stderr`` during the import and
    construction is captured and discarded; exceptions still propagate.
    """
    from contextlib import redirect_stderr

    # redirect_stderr restores the previous sys.stderr on exit, including
    # when the import or the constructor raises.
    with redirect_stderr(io.StringIO()):
        from sentence_transformers.cross_encoder import CrossEncoder

        return CrossEncoder(_RERANK_MODEL)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def rerank(question: str, documents: list[str], top_k: int) -> list[int]:
    """Return indices of the top_k documents sorted by relevance to question.

    Each ``(question, document)`` pair is scored by the cross-encoder, and
    the indices into ``documents`` are returned highest score first.

    Args:
        question: The query text.
        documents: Candidate texts to score.
        top_k: Maximum number of indices to return.

    Returns:
        At most ``top_k`` indices. Empty when ``documents`` is empty or
        ``top_k`` is not positive; the model is not loaded in that case.
    """
    # Guard before loading the model. A negative top_k would otherwise turn
    # the final slice into "everything except the last N".
    if not documents or top_k <= 0:
        return []
    model = _get_cross_encoder()
    pairs = [(question, doc) for doc in documents]
    scores = model.predict(pairs)
    ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    return ranked[:top_k]
|
askmycode/retriever.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from askmycode import embedder, reranker, store
|
|
4
|
+
|
|
5
|
+
_FETCH_MULTIPLIER = 3  # candidates fetched per requested result, before re-ranking


def retrieve(
    question: str,
    project_root: Path,
    n_results: int = 8,
) -> tuple[str, list[dict]]:
    """Find the chunks most relevant to ``question`` in a project's index.

    Up to ``n_results * _FETCH_MULTIPLIER`` candidates (capped at the
    collection size) are fetched by embedding search, then re-ranked and
    trimmed to ``n_results``.

    Returns:
        ``(context, sources)``: the kept chunk texts joined with a ``---``
        separator, and their metadata dicts in the same order. ``("", [])``
        when the index is empty or the search returns no documents.
    """
    collection = store.get_collection(store.get_client(project_root))

    candidate_count = min(n_results * _FETCH_MULTIPLIER, collection.count())
    if candidate_count == 0:
        return "", []

    question_vector = embedder.embed([question])[0]
    hits = store.search(collection, question_vector, n_results=candidate_count)

    # The search is issued for a single query; take that query's result lists.
    candidate_docs: list[str] = hits.get("documents", [[]])[0]
    candidate_meta: list[dict] = hits.get("metadatas", [[]])[0]
    if not candidate_docs:
        return "", []

    # Keep only the best n_results candidates, in re-ranked order.
    keep = reranker.rerank(question, candidate_docs, top_k=n_results)
    kept_docs = [candidate_docs[i] for i in keep]
    sources = [candidate_meta[i] for i in keep]

    context = "\n\n---\n\n".join(kept_docs)
    return context, sources
|
askmycode/store.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import chromadb
|
|
3
|
+
|
|
4
|
+
_INDEX_DIR = ".askmycode/chroma"


def get_client(project_root: Path) -> chromadb.PersistentClient:
    """Return a persistent Chroma client stored under ``project_root``.

    The storage directory (``_INDEX_DIR`` inside the project) is created if
    it does not exist yet.
    """
    storage_dir = project_root / _INDEX_DIR
    storage_dir.mkdir(parents=True, exist_ok=True)
    client = chromadb.PersistentClient(path=str(storage_dir))
    return client
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def get_collection(client: chromadb.PersistentClient) -> chromadb.Collection:
    """Fetch the ``codebase`` collection, creating it on first use.

    The collection is requested with ``hnsw:space`` set to ``cosine``.
    """
    collection_settings = {"hnsw:space": "cosine"}
    return client.get_or_create_collection(
        name="codebase",
        metadata=collection_settings,
    )
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def upsert_chunks(
    collection: chromadb.Collection,
    chunks: list[dict],
    embeddings: list[list[float]],
) -> None:
    """Insert or update ``chunks`` and their ``embeddings`` in ``collection``.

    Each chunk's ``"id"`` becomes the record id and its ``"content"`` the
    stored document; every other key of the chunk is stored as metadata.
    """
    reserved_keys = ("id", "content")
    chunk_ids = [chunk["id"] for chunk in chunks]
    texts = [chunk["content"] for chunk in chunks]
    extras = [
        {key: value for key, value in chunk.items() if key not in reserved_keys}
        for chunk in chunks
    ]
    collection.upsert(
        ids=chunk_ids,
        embeddings=embeddings,
        documents=texts,
        metadatas=extras,
    )
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def delete_by_files(collection: chromadb.Collection, file_paths: list[str]) -> None:
    """Delete every chunk whose ``file_path`` metadata matches a given path.

    Paths are handled one at a time: the matching ids are looked up first,
    and a delete is issued only when at least one id was found.
    """
    for rel_path in file_paths:
        matches = collection.get(where={"file_path": rel_path}, include=[])
        if chunk_ids := matches.get("ids", []):
            collection.delete(ids=chunk_ids)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def search(
    collection: chromadb.Collection,
    query_embedding: list[float],
    n_results: int = 8,
) -> dict:
    """Query ``collection`` with one embedding and return the raw result.

    The query asks for documents, metadatas and distances of up to
    ``n_results`` matches.
    """
    wanted_fields = ["documents", "metadatas", "distances"]
    hits = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
        include=wanted_fields,
    )
    return hits
|
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: askmycode
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Ask questions about any codebase using AI — your code stays private
|
|
5
|
+
Project-URL: Homepage, https://github.com/NewtYao/askmycode
|
|
6
|
+
Project-URL: Issues, https://github.com/NewtYao/askmycode/issues
|
|
7
|
+
License: MIT
|
|
8
|
+
Keywords: ai,cli,codebase,developer-tools,embeddings,qa,rag
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
|
+
Requires-Python: >=3.9
|
|
19
|
+
Requires-Dist: chromadb>=0.5.0
|
|
20
|
+
Requires-Dist: google-genai>=2.0.0
|
|
21
|
+
Requires-Dist: pathspec>=0.12.0
|
|
22
|
+
Requires-Dist: rich>=13.0.0
|
|
23
|
+
Requires-Dist: sentence-transformers>=3.0.0
|
|
24
|
+
Requires-Dist: tomli>=2.0.0; python_version < '3.11'
|
|
25
|
+
Requires-Dist: typer>=0.12.0
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# askmycode
|
|
29
|
+
|
|
30
|
+
**Ask questions about any codebase in plain English. Your full codebase never leaves your machine — only the few snippets relevant to each question are sent to the AI.**
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
$ askmycode ask "how does authentication work?"
|
|
34
|
+
|
|
35
|
+
Authentication is handled in src/auth/middleware.py (lines 12–45).
|
|
36
|
+
The require_auth decorator validates a JWT from the Authorization header,
|
|
37
|
+
looks up the user in the session store (Redis), and attaches the user object
|
|
38
|
+
to the request context. Unauthenticated requests are rejected with a 401...
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+

|
|
42
|
+
|
|
43
|
+
[](https://pypi.org/project/askmycode/)
|
|
44
|
+
[](https://www.python.org/)
|
|
45
|
+
[](LICENSE)
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Why askmycode?
|
|
50
|
+
|
|
51
|
+
Most AI coding tools send your entire codebase to a remote server. **askmycode doesn't.**
|
|
52
|
+
|
|
53
|
+
- **Your code stays local.** Only the question + a few relevant snippets reach the AI.
|
|
54
|
+
- **Works on any codebase** — Python, Go, TypeScript, Rust, Java, and more.
|
|
55
|
+
- **No more grepping.** Ask "where is the rate limiter?" instead of `grep -r "rate" .`
|
|
56
|
+
- **Onboarding superpower.** New to a repo? Ask questions instead of reading every file.
|
|
57
|
+
- **Multi-provider.** Gemini (free tier), OpenAI, Claude, or fully offline via Ollama.
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## How it works
|
|
62
|
+
|
|
63
|
+
```
|
|
64
|
+
Your code ──► chunked locally ──► embedded locally ──► stored in local vector DB
|
|
65
|
+
│
|
|
66
|
+
Your question ──► embed locally ──► find top chunks ──► send to AI ──► Answer
|
|
67
|
+
↑
|
|
68
|
+
only ~500 lines of relevant code
|
|
69
|
+
travel to the AI, not your whole repo
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Embeddings run on your machine via `sentence-transformers`. Only the retrieved snippets and your question are sent to the LLM.
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## Install
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
pip install askmycode
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Or from source:
|
|
83
|
+
```bash
|
|
84
|
+
git clone https://github.com/NewtYao/askmycode
|
|
85
|
+
cd askmycode
|
|
86
|
+
pip install -e .
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## Quick start
|
|
92
|
+
|
|
93
|
+
**1. Set your API key**
|
|
94
|
+
```bash
|
|
95
|
+
askmycode config set-key YOUR_GEMINI_KEY # get a free key at aistudio.google.com
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
**2. Initialise your project** *(optional — sets up .gitignore and suggests starter questions)*
|
|
99
|
+
```bash
|
|
100
|
+
cd your-project/
|
|
101
|
+
askmycode init .
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
**3. Index your codebase** *(run once; re-run after big changes)*
|
|
105
|
+
```bash
|
|
106
|
+
askmycode index .
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
**4. Ask questions**
|
|
110
|
+
```bash
|
|
111
|
+
askmycode ask "where is the database connection configured?"
|
|
112
|
+
askmycode ask "how does the retry logic work?"
|
|
113
|
+
askmycode ask "what does UserService do?"
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## Commands
|
|
119
|
+
|
|
120
|
+
### `askmycode ask`
|
|
121
|
+
```bash
|
|
122
|
+
askmycode ask "your question"
|
|
123
|
+
--sources, -s Show which files and lines were used to answer
|
|
124
|
+
--results, -n INT Chunks to retrieve (default: 8)
|
|
125
|
+
--provider TEXT gemini | openai | anthropic | ollama
|
|
126
|
+
--model TEXT Model name override
|
|
127
|
+
--project, -p PATH Path to indexed project (default: .)
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
# Show sources alongside the answer
|
|
132
|
+
askmycode ask "how is caching implemented?" --sources
|
|
133
|
+
|
|
134
|
+
# Use a different provider for one question
|
|
135
|
+
askmycode ask "what does this codebase do?" --provider openai
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### `askmycode chat`
|
|
139
|
+
Multi-turn conversation — follow-up questions remember previous answers.
|
|
140
|
+
```bash
|
|
141
|
+
askmycode chat
|
|
142
|
+
|
|
143
|
+
You: how does auth work?
|
|
144
|
+
Assistant: ...
|
|
145
|
+
|
|
146
|
+
You: what about the refresh token flow? # remembers the previous answer
|
|
147
|
+
Assistant: ...
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### `askmycode index`
|
|
151
|
+
```bash
|
|
152
|
+
askmycode index . # index current directory
|
|
153
|
+
askmycode index /path/to/project # index a specific path
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
Re-indexing is **incremental** — only changed files are re-embedded.
|
|
157
|
+
|
|
158
|
+
### `askmycode stats`
|
|
159
|
+
```bash
|
|
160
|
+
askmycode stats
|
|
161
|
+
|
|
162
|
+
Files indexed: 42
|
|
163
|
+
Total chunks: 187
|
|
164
|
+
Last indexed: 2026-05-23 14:30:01
|
|
165
|
+
Index location: /your/project/.askmycode
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### `askmycode init`
|
|
169
|
+
```bash
|
|
170
|
+
askmycode init .
|
|
171
|
+
```
|
|
172
|
+
Detects your project type (Python, Go, Node, Rust...), adds `.askmycode/` to `.gitignore`, and prints suggested first questions.
|
|
173
|
+
|
|
174
|
+
### `askmycode config`
|
|
175
|
+
```bash
|
|
176
|
+
askmycode config set-key YOUR_API_KEY
|
|
177
|
+
askmycode config set-provider gemini # gemini | openai | anthropic | ollama
|
|
178
|
+
askmycode config set-model gpt-4o # optional model override
|
|
179
|
+
askmycode config show
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## Multiple providers
|
|
185
|
+
|
|
186
|
+
| Provider | Command | Notes |
|
|
187
|
+
|---|---|---|
|
|
188
|
+
| **Gemini** (default) | `config set-provider gemini` | Free tier at aistudio.google.com |
|
|
189
|
+
| **OpenAI** | `config set-provider openai` | Requires `OPENAI_API_KEY` |
|
|
190
|
+
| **Claude** | `config set-provider anthropic` | Requires `ANTHROPIC_API_KEY` |
|
|
191
|
+
| **Ollama** | `config set-provider ollama` | 100% offline, no API key |
|
|
192
|
+
|
|
193
|
+
### Fully offline with Ollama
|
|
194
|
+
```bash
|
|
195
|
+
# 1. Install Ollama: https://ollama.com
|
|
196
|
+
# 2. Pull a model
|
|
197
|
+
ollama pull llama3
|
|
198
|
+
|
|
199
|
+
# 3. Use it
|
|
200
|
+
askmycode config set-provider ollama
|
|
201
|
+
askmycode config set-model llama3
|
|
202
|
+
askmycode ask "how does this work?" # zero internet required
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
---
|
|
206
|
+
|
|
207
|
+
## Per-project config
|
|
208
|
+
|
|
209
|
+
Commit an `.askmycode.toml` to share settings with your team:
|
|
210
|
+
|
|
211
|
+
```toml
|
|
212
|
+
[askmycode]
|
|
213
|
+
provider = "gemini"
|
|
214
|
+
model = "gemini-2.5-flash-lite"
|
|
215
|
+
n_results = 10
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
---
|
|
219
|
+
|
|
220
|
+
## GitHub Action
|
|
221
|
+
|
|
222
|
+
Add AI-powered Q&A directly to your pull requests. Team members comment `/ask <question>` and get an instant answer.
|
|
223
|
+
|
|
224
|
+
**Setup:**
|
|
225
|
+
1. Add your API key as a repository secret: `GEMINI_API_KEY`
|
|
226
|
+
2. Create `.github/workflows/askmycode.yml` (see [example](.github/workflows/askmycode.yml))
|
|
227
|
+
|
|
228
|
+
**Usage in a PR:**
|
|
229
|
+
```
|
|
230
|
+
/ask where is error handling for the payment flow?
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
The bot replies with the answer + source file references, inline in the PR.
|
|
234
|
+
|
|
235
|
+
---
|
|
236
|
+
|
|
237
|
+
## Privacy
|
|
238
|
+
|
|
239
|
+
| What happens | Where it goes |
|
|
240
|
+
|---|---|
|
|
241
|
+
| Your full codebase | **Stays on your machine** |
|
|
242
|
+
| Embeddings (vector index) | **Stored in `.askmycode/` in your project** |
|
|
243
|
+
| Your question + ~500 lines of relevant code | Sent to the LLM API |
|
|
244
|
+
|
|
245
|
+
Use `--provider ollama` to keep everything — including your question and the retrieved snippets — on your machine.
|
|
246
|
+
|
|
247
|
+
---
|
|
248
|
+
|
|
249
|
+
## Supported file types
|
|
250
|
+
|
|
251
|
+
Python · JavaScript · TypeScript · Go · Rust · Java · C/C++ · Ruby · PHP · Swift · Kotlin · Scala · Shell · SQL · YAML · JSON · TOML · Markdown · HTML · CSS · Vue · Svelte
|
|
252
|
+
|
|
253
|
+
Python files are chunked at **function and class boundaries** using AST parsing, giving higher-quality retrieval than fixed-size splitting.
|
|
254
|
+
|
|
255
|
+
---
|
|
256
|
+
|
|
257
|
+
## How retrieval works (technical)
|
|
258
|
+
|
|
259
|
+
1. **Index:** Files are chunked (AST-aware for Python, sliding window for others), embedded with `all-MiniLM-L6-v2`, and stored in a local ChromaDB.
|
|
260
|
+
2. **Query:** Your question is embedded. The top `n × 3` candidates are fetched by cosine similarity, then re-ranked with a cross-encoder (`ms-marco-MiniLM-L-6-v2`) for higher precision.
|
|
261
|
+
3. **Answer:** The top N chunks + a project file tree are sent to the LLM with a developer-focused system prompt. The answer streams back in real time.
|
|
262
|
+
|
|
263
|
+
---
|
|
264
|
+
|
|
265
|
+
## Roadmap
|
|
266
|
+
|
|
267
|
+
- [ ] VS Code extension
|
|
268
|
+
- [ ] Watch mode (auto re-index on file save)
|
|
269
|
+
- [ ] Semantic diff Q&A (`askmycode ask` about uncommitted changes)
|
|
270
|
+
- [ ] Kotlin / Swift AST chunking
|
|
271
|
+
- [ ] Web UI
|
|
272
|
+
|
|
273
|
+
---
|
|
274
|
+
|
|
275
|
+
## Contributing
|
|
276
|
+
|
|
277
|
+
PRs welcome. Run the tool against itself to test:
|
|
278
|
+
```bash
|
|
279
|
+
askmycode index .
|
|
280
|
+
askmycode ask "how does the indexer work?"
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
---
|
|
284
|
+
|
|
285
|
+
## License
|
|
286
|
+
|
|
287
|
+
MIT
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
askmycode/__init__.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
|
|
2
|
+
askmycode/cli.py,sha256=ZLnz2bcqi5avUtCs_nDL5qoAAD7FC0EIs6HokMUPBM0,13256
|
|
3
|
+
askmycode/config.py,sha256=NhriG4MtCobu-wy4axZjaHnpWUdtMSiZ3TVw-i7mgGA,1260
|
|
4
|
+
askmycode/embedder.py,sha256=F6ILzkBJ_KU3vK7_jko3UPRM5uI2lX41sWgwfuDCmqg,715
|
|
5
|
+
askmycode/indexer.py,sha256=3Isqm5uYY9GmZRQV-aDFI5IVdXCBSaQ9MJtDkL1Fouk,5838
|
|
6
|
+
askmycode/llm.py,sha256=HTnTYL8HUA2sgq_VqT5eRE0-zD0LxO8mGUNeVTnpuRA,6179
|
|
7
|
+
askmycode/manifest.py,sha256=vaG4QFstF6uuEJ2sFMHwyE8Vg3x5edoW9kGLi6kyaXQ,856
|
|
8
|
+
askmycode/project_config.py,sha256=3KUXIzPdnr3ndbnuHF4Qk_Utw7qputloJaDtFfru4aQ,784
|
|
9
|
+
askmycode/reranker.py,sha256=_RB8bQZs1JD01-VkSu7d1kC7rRSeLL97EgjnWMqqiZ4,832
|
|
10
|
+
askmycode/retriever.py,sha256=1IngYhXszKLxc4A06jSr2BOEGMwvskwYmZ_TWzX6sVU,1128
|
|
11
|
+
askmycode/store.py,sha256=QdIq5ZKkBLMlc-j-Lf6TJVbI2yBE-RXfedhrPd251Tc,1545
|
|
12
|
+
askmycode-0.1.0.dist-info/METADATA,sha256=0Rnnw48uv6eXzwpJpymnUd3lmcISgg1HADlSwThzkB4,8284
|
|
13
|
+
askmycode-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
14
|
+
askmycode-0.1.0.dist-info/entry_points.txt,sha256=t_P3Y-FDI4c3VGAnd8DV9e5Jcjx7uu4H3RiOPWix8sc,48
|
|
15
|
+
askmycode-0.1.0.dist-info/RECORD,,
|