hanzo-mcp 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hanzo-mcp might be problematic.
- hanzo_mcp/__init__.py +1 -1
- hanzo_mcp/config/settings.py +61 -0
- hanzo_mcp/tools/__init__.py +158 -12
- hanzo_mcp/tools/common/base.py +7 -2
- hanzo_mcp/tools/common/config_tool.py +396 -0
- hanzo_mcp/tools/common/stats.py +261 -0
- hanzo_mcp/tools/common/tool_disable.py +144 -0
- hanzo_mcp/tools/common/tool_enable.py +182 -0
- hanzo_mcp/tools/common/tool_list.py +263 -0
- hanzo_mcp/tools/database/__init__.py +71 -0
- hanzo_mcp/tools/database/database_manager.py +246 -0
- hanzo_mcp/tools/database/graph_add.py +257 -0
- hanzo_mcp/tools/database/graph_query.py +536 -0
- hanzo_mcp/tools/database/graph_remove.py +267 -0
- hanzo_mcp/tools/database/graph_search.py +348 -0
- hanzo_mcp/tools/database/graph_stats.py +345 -0
- hanzo_mcp/tools/database/sql_query.py +229 -0
- hanzo_mcp/tools/database/sql_search.py +296 -0
- hanzo_mcp/tools/database/sql_stats.py +254 -0
- hanzo_mcp/tools/editor/__init__.py +11 -0
- hanzo_mcp/tools/editor/neovim_command.py +272 -0
- hanzo_mcp/tools/editor/neovim_edit.py +290 -0
- hanzo_mcp/tools/editor/neovim_session.py +356 -0
- hanzo_mcp/tools/filesystem/__init__.py +20 -1
- hanzo_mcp/tools/filesystem/batch_search.py +812 -0
- hanzo_mcp/tools/filesystem/find_files.py +348 -0
- hanzo_mcp/tools/filesystem/git_search.py +505 -0
- hanzo_mcp/tools/llm/__init__.py +27 -0
- hanzo_mcp/tools/llm/consensus_tool.py +351 -0
- hanzo_mcp/tools/llm/llm_manage.py +413 -0
- hanzo_mcp/tools/llm/llm_tool.py +346 -0
- hanzo_mcp/tools/llm/provider_tools.py +412 -0
- hanzo_mcp/tools/mcp/__init__.py +11 -0
- hanzo_mcp/tools/mcp/mcp_add.py +263 -0
- hanzo_mcp/tools/mcp/mcp_remove.py +127 -0
- hanzo_mcp/tools/mcp/mcp_stats.py +165 -0
- hanzo_mcp/tools/shell/__init__.py +27 -7
- hanzo_mcp/tools/shell/logs.py +265 -0
- hanzo_mcp/tools/shell/npx.py +194 -0
- hanzo_mcp/tools/shell/npx_background.py +254 -0
- hanzo_mcp/tools/shell/pkill.py +262 -0
- hanzo_mcp/tools/shell/processes.py +279 -0
- hanzo_mcp/tools/shell/run_background.py +326 -0
- hanzo_mcp/tools/shell/uvx.py +187 -0
- hanzo_mcp/tools/shell/uvx_background.py +249 -0
- hanzo_mcp/tools/vector/__init__.py +21 -12
- hanzo_mcp/tools/vector/ast_analyzer.py +459 -0
- hanzo_mcp/tools/vector/git_ingester.py +485 -0
- hanzo_mcp/tools/vector/index_tool.py +358 -0
- hanzo_mcp/tools/vector/infinity_store.py +465 -1
- hanzo_mcp/tools/vector/mock_infinity.py +162 -0
- hanzo_mcp/tools/vector/vector_index.py +7 -6
- hanzo_mcp/tools/vector/vector_search.py +22 -7
- {hanzo_mcp-0.5.0.dist-info → hanzo_mcp-0.5.2.dist-info}/METADATA +68 -20
- hanzo_mcp-0.5.2.dist-info/RECORD +106 -0
- hanzo_mcp-0.5.0.dist-info/RECORD +0 -63
- {hanzo_mcp-0.5.0.dist-info → hanzo_mcp-0.5.2.dist-info}/WHEEL +0 -0
- {hanzo_mcp-0.5.0.dist-info → hanzo_mcp-0.5.2.dist-info}/entry_points.txt +0 -0
- {hanzo_mcp-0.5.0.dist-info → hanzo_mcp-0.5.2.dist-info}/licenses/LICENSE +0 -0
- {hanzo_mcp-0.5.0.dist-info → hanzo_mcp-0.5.2.dist-info}/top_level.txt +0 -0
hanzo_mcp/tools/vector/git_ingester.py (new file)
@@ -0,0 +1,485 @@
"""Git repository ingester for comprehensive code indexing.

This module provides functionality to ingest entire git repositories including:
- Full git history and commit metadata
- File contents at different points in time
- AST analysis via tree-sitter
- Symbol extraction and cross-references
- Blame information for line-level attribution
"""

import subprocess
import json
import hashlib
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any
from datetime import datetime
from dataclasses import dataclass
import logging

from .infinity_store import InfinityVectorStore
from .ast_analyzer import ASTAnalyzer

logger = logging.getLogger(__name__)


@dataclass
class GitCommit:
    """Represents a git commit."""
    hash: str
    author: str
    author_email: str
    timestamp: int
    message: str
    files: List[Dict[str, str]]  # [{'status': 'M', 'filename': 'main.py'}]
    parent_hashes: List[str]


@dataclass
class GitFileHistory:
    """History of a single file."""
    file_path: str
    commits: List[GitCommit]
    current_content: Optional[str]
    line_blame: Dict[int, Dict[str, Any]]  # line_number -> blame info


class GitIngester:
    """Ingests git repositories into vector store."""

    def __init__(self, vector_store: InfinityVectorStore):
        """Initialize the git ingester.

        Args:
            vector_store: The vector store to ingest into
        """
        self.vector_store = vector_store
        self.ast_analyzer = ASTAnalyzer()
        self._commit_cache: Dict[str, GitCommit] = {}

    def ingest_repository(
        self,
        repo_path: str,
        branch: str = "HEAD",
        include_history: bool = True,
        include_diffs: bool = True,
        include_blame: bool = True,
        file_patterns: Optional[List[str]] = None
    ) -> Dict[str, Any]:
        """Ingest an entire git repository.

        Args:
            repo_path: Path to the git repository
            branch: Branch to ingest (default: HEAD)
            include_history: Whether to include commit history
            include_diffs: Whether to include diff information
            include_blame: Whether to include blame information
            file_patterns: List of file patterns to include (e.g., ["*.py", "*.js"])

        Returns:
            Summary of ingestion results
        """
        repo_path = Path(repo_path)
        if not (repo_path / ".git").exists():
            raise ValueError(f"Not a git repository: {repo_path}")

        logger.info(f"Starting ingestion of repository: {repo_path}")

        results = {
            "repository": str(repo_path),
            "branch": branch,
            "commits_processed": 0,
            "commits_indexed": 0,
            "files_indexed": 0,
            "symbols_extracted": 0,
            "diffs_indexed": 0,
            "blame_entries": 0,
            "errors": []
        }

        try:
            # Get current branch/commit
            current_commit = self._get_current_commit(repo_path)
            results["current_commit"] = current_commit

            # Get list of files to process
            files = self._get_repository_files(repo_path, file_patterns)
            logger.info(f"Found {len(files)} files to process")

            # Process each file
            for file_path in files:
                try:
                    self._process_file(
                        repo_path,
                        file_path,
                        include_history=include_history,
                        include_blame=include_blame,
                        results=results
                    )
                except Exception as e:
                    logger.error(f"Error processing {file_path}: {e}")
                    results["errors"].append(f"{file_path}: {str(e)}")

            # Process commit history if requested
            if include_history:
                commits = self._get_commit_history(repo_path, branch)
                results["commits_processed"] = len(commits)

                for commit in commits:
                    self._index_commit(commit, include_diffs=include_diffs)
                    results["commits_indexed"] = results.get("commits_indexed", 0) + 1

                    if include_diffs:
                        results["diffs_indexed"] += len(commit.files)

            # Create repository metadata document
            self._index_repository_metadata(repo_path, results)

        except Exception as e:
            logger.error(f"Repository ingestion failed: {e}")
            results["errors"].append(f"Fatal error: {str(e)}")

        logger.info(f"Ingestion complete: {results}")
        return results

    def _get_current_commit(self, repo_path: Path) -> str:
        """Get the current commit hash."""
        result = subprocess.run(
            ["git", "rev-parse", "HEAD"],
            cwd=repo_path,
            capture_output=True,
            text=True,
            check=True
        )
        return result.stdout.strip()

    def _get_repository_files(
        self,
        repo_path: Path,
        patterns: Optional[List[str]] = None
    ) -> List[Path]:
        """Get list of files in repository matching patterns."""
        # Use git ls-files to respect .gitignore
        cmd = ["git", "ls-files"]

        result = subprocess.run(
            cmd,
            cwd=repo_path,
            capture_output=True,
            text=True,
            check=True
        )

        files = []
        for line in result.stdout.strip().split('\n'):
            if line:
                file_path = repo_path / line
                if file_path.exists():
                    # Apply pattern filtering if specified
                    if patterns:
                        if any(file_path.match(pattern) for pattern in patterns):
                            files.append(file_path)
                    else:
                        files.append(file_path)

        return files

    def _get_commit_history(
        self,
        repo_path: Path,
        branch: str = "HEAD",
        max_commits: int = 1000
    ) -> List[GitCommit]:
        """Get commit history for the repository."""
        # Get commit list with basic info
        result = subprocess.run(
            ["git", "log", branch, f"--max-count={max_commits}",
             "--pretty=format:%H|%P|%an|%ae|%at|%s"],
            cwd=repo_path,
            capture_output=True,
            text=True,
            check=True
        )

        commits = []
        for line in result.stdout.strip().split('\n'):
            if line:
                parts = line.split('|', 5)
                if len(parts) >= 6:
                    commit_hash = parts[0]
                    parent_hashes = parts[1].split() if parts[1] else []

                    # Get file changes for this commit
                    files = self._get_commit_files(repo_path, commit_hash)

                    commit = GitCommit(
                        hash=commit_hash,
                        parent_hashes=parent_hashes,
                        author=parts[2],
                        author_email=parts[3],
                        timestamp=int(parts[4]),
                        message=parts[5],
                        files=files
                    )
                    commits.append(commit)
                    self._commit_cache[commit_hash] = commit

        return commits

    def _get_commit_files(self, repo_path: Path, commit_hash: str) -> List[Dict[str, str]]:
        """Get list of files changed in a commit."""
        result = subprocess.run(
            ["git", "show", "--name-status", "--format=", commit_hash],
            cwd=repo_path,
            capture_output=True,
            text=True,
            check=True
        )

        files = []
        for line in result.stdout.strip().split('\n'):
            if line and '\t' in line:
                parts = line.split('\t', 1)
                if len(parts) == 2:
                    files.append({
                        'status': parts[0],
                        'filename': parts[1]
                    })

        return files

    def _process_file(
        self,
        repo_path: Path,
        file_path: Path,
        include_history: bool,
        include_blame: bool,
        results: Dict[str, Any]
    ):
        """Process a single file."""
        relative_path = file_path.relative_to(repo_path)

        # Read current content
        try:
            content = file_path.read_text(encoding='utf-8')
        except UnicodeDecodeError:
            content = file_path.read_text(encoding='latin-1')

        # Get file metadata
        metadata = {
            'repository': str(repo_path),
            'relative_path': str(relative_path),
            'file_type': file_path.suffix,
            'size': file_path.stat().st_size,
        }

        # Add git history metadata if requested
        if include_history:
            history = self._get_file_history(repo_path, relative_path)
            metadata['commit_count'] = len(history)
            if history:
                metadata['first_commit'] = history[-1]['hash']
                metadata['last_commit'] = history[0]['hash']
                metadata['last_modified'] = datetime.fromtimestamp(
                    history[0]['timestamp']
                ).isoformat()

        # Add blame information if requested
        if include_blame:
            blame_data = self._get_file_blame(repo_path, relative_path)
            metadata['unique_authors'] = len(set(
                b['author'] for b in blame_data.values()
            ))

        # Index the file content
        doc_ids = self.vector_store.add_file(
            str(file_path),
            chunk_size=1000,
            chunk_overlap=200,
            metadata=metadata
        )
        results["files_indexed"] += 1

        # Perform AST analysis for supported languages
        if file_path.suffix in ['.py', '.js', '.ts', '.java', '.cpp', '.c']:
            try:
                file_ast = self.ast_analyzer.analyze_file(str(file_path))
                if file_ast:
                    # Store complete AST
                    self.vector_store._store_file_ast(file_ast)

                    # Store individual symbols
                    self.vector_store._store_symbols(file_ast.symbols)
                    results["symbols_extracted"] += len(file_ast.symbols)

                    # Store cross-references
                    self.vector_store._store_references(file_ast)
            except Exception as e:
                logger.warning(f"AST analysis failed for {file_path}: {e}")

    def _get_file_history(
        self,
        repo_path: Path,
        file_path: Path
    ) -> List[Dict[str, Any]]:
        """Get commit history for a specific file."""
        result = subprocess.run(
            ["git", "log", "--follow", "--pretty=format:%H|%at|%an|%s", "--", str(file_path)],
            cwd=repo_path,
            capture_output=True,
            text=True
        )

        if result.returncode != 0:
            return []

        history = []
        for line in result.stdout.strip().split('\n'):
            if line:
                parts = line.split('|', 3)
                if len(parts) >= 4:
                    history.append({
                        'hash': parts[0],
                        'timestamp': int(parts[1]),
                        'author': parts[2],
                        'message': parts[3]
                    })

        return history

    def _get_file_blame(
        self,
        repo_path: Path,
        file_path: Path
    ) -> Dict[int, Dict[str, Any]]:
        """Get blame information for a file."""
        result = subprocess.run(
            ["git", "blame", "--line-porcelain", "--", str(file_path)],
            cwd=repo_path,
            capture_output=True,
            text=True
        )

        if result.returncode != 0:
            return {}

        blame_data = {}
        current_commit = None
        current_line = None
        author = None
        timestamp = None

        for line in result.stdout.strip().split('\n'):
            if line and not line.startswith('\t'):
                parts = line.split(' ')
                if len(parts) >= 3 and len(parts[0]) == 40:  # SHA-1 hash
                    current_commit = parts[0]
                    current_line = int(parts[2])
                elif line.startswith('author '):
                    author = line[7:]
                elif line.startswith('author-time '):
                    timestamp = int(line[12:])

                    # We have all the data for this line
                    if current_line and author:
                        blame_data[current_line] = {
                            'commit': current_commit,
                            'author': author,
                            'timestamp': timestamp
                        }

        return blame_data

    def _index_commit(
        self,
        commit: GitCommit,
        include_diffs: bool = True
    ):
        """Index a single commit."""
        # Create commit document
        commit_doc = f"""Git Commit: {commit.hash}
Author: {commit.author} <{commit.author_email}>
Date: {datetime.fromtimestamp(commit.timestamp).isoformat()}
Message: {commit.message}

Files changed: {len(commit.files)}
"""

        for file_info in commit.files:
            commit_doc += f"\n{file_info['status']}\t{file_info['filename']}"

        # Index commit
        metadata = {
            'type': 'git_commit',
            'commit_hash': commit.hash,
            'author': commit.author,
            'timestamp': commit.timestamp,
            'file_count': len(commit.files)
        }

        self.vector_store.add_document(commit_doc, metadata)

        # Index diffs if requested
        if include_diffs:
            for file_info in commit.files:
                self._index_commit_diff(commit, file_info['filename'])

    def _index_commit_diff(self, commit: GitCommit, filename: str):
        """Index the diff for a specific file in a commit."""
        # This is a simplified version - in practice you'd want to
        # parse the actual diff and store meaningful chunks
        metadata = {
            'type': 'git_diff',
            'commit_hash': commit.hash,
            'filename': filename,
            'author': commit.author,
            'timestamp': commit.timestamp
        }

        # Create a document representing this change
        diff_doc = f"""File: {filename}
Commit: {commit.hash}
Author: {commit.author}
Message: {commit.message}
"""

        self.vector_store.add_document(diff_doc, metadata)

    def _index_repository_metadata(
        self,
        repo_path: Path,
        results: Dict[str, Any]
    ):
        """Index overall repository metadata."""
        # Get repository info
        remote_result = subprocess.run(
            ["git", "remote", "get-url", "origin"],
            cwd=repo_path,
            capture_output=True,
            text=True
        )

        remote_url = remote_result.stdout.strip() if remote_result.returncode == 0 else None

        # Create repository summary document
        repo_doc = f"""Repository: {repo_path.name}
Path: {repo_path}
Remote: {remote_url or 'No remote'}
Current Commit: {results.get('current_commit', 'Unknown')}

Statistics:
- Files indexed: {results['files_indexed']}
- Commits processed: {results['commits_processed']}
- Symbols extracted: {results['symbols_extracted']}
- Diffs indexed: {results['diffs_indexed']}
"""

        metadata = {
            'type': 'repository',
            'name': repo_path.name,
            'path': str(repo_path),
            'remote_url': remote_url,
            **results
        }

        self.vector_store.add_document(repo_doc, metadata)