awslabs.git-repo-research-mcp-server 0.0.1 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- awslabs/__init__.py +12 -0
- awslabs/git_repo_research_mcp_server/__init__.py +13 -0
- awslabs/git_repo_research_mcp_server/defaults.py +347 -0
- awslabs/git_repo_research_mcp_server/embeddings.py +66 -0
- awslabs/git_repo_research_mcp_server/github_search.py +471 -0
- awslabs/git_repo_research_mcp_server/indexer.py +860 -0
- awslabs/git_repo_research_mcp_server/models.py +291 -0
- awslabs/git_repo_research_mcp_server/repository.py +321 -0
- awslabs/git_repo_research_mcp_server/search.py +350 -0
- awslabs/git_repo_research_mcp_server/server.py +914 -0
- awslabs/git_repo_research_mcp_server/utils.py +396 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/METADATA +190 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/RECORD +17 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/WHEEL +4 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/entry_points.txt +2 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/licenses/LICENSE +175 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/licenses/NOTICE +2 -0
awslabs/git_repo_research_mcp_server/search.py
@@ -0,0 +1,350 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+# with the License. A copy of the License is located at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
+# and limitations under the License.
+"""Search functionality for Git Repository Research MCP Server.
+
+This module provides functionality for searching within indexed Git repositories
+using LangChain's FAISS implementation.
+"""
+
+import os
+import time
+from awslabs.git_repo_research_mcp_server.defaults import Constants
+from awslabs.git_repo_research_mcp_server.embeddings import get_embedding_model
+from awslabs.git_repo_research_mcp_server.indexer import (
+    IndexConfig,
+    get_docstore_dict_size,
+    get_repository_indexer,
+)
+from awslabs.git_repo_research_mcp_server.models import (
+    EmbeddingModel,
+    SearchResponse,
+    SearchResult,
+)
+from loguru import logger
+from typing import Optional
+
+
+class RepositorySearcher:
+    """Searcher for indexed Git repositories using LangChain.
+
+    This class provides methods for searching within indexed Git repositories.
+    """
+
+    def __init__(
+        self,
+        embedding_model: str = EmbeddingModel.AMAZON_TITAN_EMBED_TEXT_V2,
+        aws_region: Optional[str] = None,
+        aws_profile: Optional[str] = None,
+        index_dir: Optional[str] = None,
+    ):
+        """Initialize the repository searcher.
+
+        Args:
+            embedding_model: ID of the embedding model to use
+            aws_region: AWS region to use (optional, uses default if not provided)
+            aws_profile: AWS profile to use (optional, uses default if not provided)
+            index_dir: Directory where indices are stored (optional, uses default if not provided)
+        """
+        self.embedding_model = embedding_model
+        self.aws_region = aws_region
+        self.aws_profile = aws_profile
+        self.index_dir = index_dir or os.path.expanduser(f'~/{Constants.DEFAULT_INDEX_DIR}')
+
+        self.config = IndexConfig(
+            embedding_model=embedding_model,
+            aws_region=aws_region,
+            aws_profile=aws_profile,
+            index_dir=index_dir or os.path.expanduser(f'~/{Constants.DEFAULT_INDEX_DIR}'),
+        )
+
+        # Initialize the embedding generator
+        self.embedding_generator = get_embedding_model(
+            model_id=embedding_model,
+            aws_region=aws_region,
+            aws_profile=aws_profile,
+        )
+
+        # Initialize the repository indexer
+        self.repository_indexer = get_repository_indexer(self.config)
+
+    def list_repository_files(self, repository_name: str) -> Optional[str]:
+        """Generate a directory tree structure of the repository files.
+
+        Args:
+            repository_name: Name of the repository
+
+        Returns:
+            String representation of the directory tree, or None if repository not found
+        """
+        # Get the index path for the repository
+        index_path = self.repository_indexer._get_index_path(repository_name)
+
+        # Construct the path to the repository directory
+        repo_files_path = os.path.join(index_path, 'repository')
+
+        # Check if the repository directory exists
+        if not os.path.exists(repo_files_path) or not os.path.isdir(repo_files_path):
+            logger.warning(f'Repository directory not found: {repo_files_path}')
+            return None
+
+        try:
+            # Generate the directory tree
+            tree = self._generate_directory_tree(repo_files_path)
+            return tree
+        except Exception as e:
+            logger.error(f'Error generating directory tree for {repository_name}: {e}')
+            return None
+
+    def _generate_directory_tree(self, path: str) -> str:
+        """Generate a directory tree structure for a given path.
+
+        Args:
+            path: Path to the directory
+
+        Returns:
+            String representation of the directory tree
+        """
+        # Get the base name of the path
+        base_name = os.path.basename(path)
+
+        # Initialize the tree string
+        tree = f'Directory structure:\n└── {base_name}/\n'
+
+        # Generate the tree recursively
+        tree += self._generate_tree(path, '', base_name)
+
+        return tree
+
+    def _generate_tree(self, path: str, prefix: str, base_path: str) -> str:
+        """Recursively generate a directory tree structure.
+
+        Args:
+            path: Path to the current directory
+            prefix: Prefix for the current line
+            base_path: Base path to remove from the full path
+
+        Returns:
+            String representation of the directory tree
+        """
+        # Get all entries in the directory
+        entries = sorted(os.listdir(path))
+
+        # Filter out hidden files and directories
+        entries = [e for e in entries if not e.startswith('.')]
+
+        # Initialize the tree string
+        tree = ''
+
+        # Process each entry
+        for i, entry in enumerate(entries):
+            # Construct the full path
+            full_path = os.path.join(path, entry)
+
+            # Check if this is the last entry
+            is_last = i == len(entries) - 1
+
+            # Add the entry to the tree
+            if is_last:
+                tree += f'{prefix} └── '
+                new_prefix = prefix + ' '
+            else:
+                tree += f'{prefix} ├── '
+                new_prefix = prefix + ' │'
+
+            # Check if the entry is a directory
+            if os.path.isdir(full_path):
+                # Add the directory name
+                tree += f'{entry}/\n'
+
+                # Recursively process the directory
+                # Always include the directory in the tree, even if it's empty
+                subtree = self._generate_tree(full_path, new_prefix, base_path)
+                tree += subtree
+            else:
+                # Add the file name
+                tree += f'{entry}\n'
+
+        return tree
+
+    def search(
+        self,
+        index_path: str,
+        query: str,
+        limit: int = 10,
+        threshold: float = 0.0,
+    ) -> SearchResponse:
+        """Search within an indexed repository using LangChain's FAISS implementation.
+
+        Args:
+            index_path: Path to the index file or repository name
+            query: Search query text
+            limit: Maximum number of results to return
+            threshold: Similarity threshold for results (0.0-1.0)
+
+        Returns:
+            SearchResponse object with search results
+
+        Raises:
+            Exception: If search fails
+        """
+        start_time = time.time()
+        # Initialize repository_name with a default value outside the try block
+        repository_name = 'unknown'
+
+        try:
+            # Check if index_path is a repository name or a file path
+            if os.path.exists(index_path) and os.path.isdir(index_path):
+                # It's a directory path, extract the repository name
+                repository_name = os.path.basename(index_path)
+            else:
+                # It's a repository name
+                repository_name = index_path
+                index_path = self.repository_indexer._get_index_path(repository_name)
+
+            # Load the index and chunk map
+            vector_store = self.repository_indexer.load_index_without_pickle(repository_name)
+            if vector_store is None:
+                logger.error(f'Index or chunk map not found for repository {repository_name}')
+                # Set repository_directory even if index is not found
+                repo_files_path = os.path.join(index_path, 'repository')
+                return SearchResponse(
+                    results=[],
+                    query=query,
+                    index_path=index_path,
+                    repository_name=repository_name,
+                    repository_directory=repo_files_path,
+                    total_results=0,
+                    execution_time_ms=int((time.time() - start_time) * 1000),
+                )
+
+            # Use LangChain's similarity search
+            logger.info(f"Searching for '{query}' in repository {repository_name}")
+
+            # Debug: Print vector store info
+            logger.info(f'Vector store type: {type(vector_store)}')
+            logger.info(
+                f'Vector store docstore size: {get_docstore_dict_size(vector_store.docstore)}'
+            )
+
+            # Use the same approach as in the test script
+            try:
+                # Use similarity_search directly
+                langchain_results = vector_store.similarity_search(query, k=limit)
+
+                # Process the results
+                results = []
+                if langchain_results:
+                    logger.info(f'Found {len(langchain_results)} results')
+                    for doc in langchain_results:
+                        # Get file path from document metadata
+                        file_path = doc.metadata.get('source', 'unknown')
+
+                        # Create a search result
+                        result = SearchResult(
+                            file_path=file_path,
+                            content=doc.page_content,
+                            score=1.0,  # Default score since we're not using similarity_search_with_score
+                            line_numbers=None,  # We don't track line numbers currently
+                            metadata={'chunk_id': str(doc.metadata.get('chunk_id', -1))},
+                        )
+                        results.append(result)
+                else:
+                    logger.info('No results found')
+            except Exception as e:
+                logger.error(f'Error with similarity_search: {e}')
+                # Try with similarity_search_with_score as a fallback
+                try:
+                    logger.info('Trying with similarity_search_with_score as fallback')
+                    langchain_results = vector_store.similarity_search_with_score(query, k=limit)
+
+                    # Process the results
+                    results = []
+                    for doc, score in langchain_results:
+                        # Get file path from document metadata
+                        file_path = doc.metadata.get('source', 'unknown')
+
+                        # Convert score to similarity (0-1 range)
+                        similarity = 1.0 - min(1.0, score / 2.0)
+
+                        # Create a search result
+                        result = SearchResult(
+                            file_path=file_path,
+                            content=doc.page_content,
+                            score=float(similarity),
+                            line_numbers=None,  # We don't track line numbers currently
+                            metadata={
+                                'distance': str(float(score)),
+                                'chunk_id': str(doc.metadata.get('chunk_id', -1)),
+                            },
+                        )
+                        results.append(result)
+                except Exception as e:
+                    logger.error(f'Error with similarity_search_with_score fallback: {e}')
+                    results = []
+
+            execution_time_ms = int((time.time() - start_time) * 1000)
+            logger.info(f'Search completed in {execution_time_ms}ms, found {len(results)} results')
+
+            # Add repository directory information to the response
+            repo_files_path = os.path.join(index_path, 'repository')
+
+            # Always set repository_directory to the expected path
+            repository_directory = repo_files_path
+
+            return SearchResponse(
+                results=results,
+                query=query,
+                index_path=index_path,
+                repository_name=repository_name,
+                repository_directory=repository_directory,
+                total_results=len(results),
+                execution_time_ms=execution_time_ms,
+            )
+
+        except Exception as e:
+            logger.error(f'Error searching repository: {e}')
+            # repository_name is already defined outside the try block
+            # Set repository_directory even in case of error
+            repo_files_path = os.path.join(index_path, 'repository')
+            return SearchResponse(
+                results=[],
+                query=query,
+                index_path=index_path,
+                repository_name=repository_name,
+                repository_directory=repo_files_path,
+                total_results=0,
+                execution_time_ms=int((time.time() - start_time) * 1000),
+            )
+
+
+def get_repository_searcher(
+    embedding_model: str = EmbeddingModel.AMAZON_TITAN_EMBED_TEXT_V2,
+    aws_region: Optional[str] = None,
+    aws_profile: Optional[str] = None,
+    index_dir: Optional[str] = None,
+) -> RepositorySearcher:
+    """Factory method to return a repository searcher.
+
+    Args:
+        embedding_model: ID of the embedding model to use
+        aws_region: AWS region to use (optional, uses default if not provided)
+        aws_profile: AWS profile to use (optional, uses default if not provided)
+        index_dir: Directory where indices are stored (optional, uses default if not provided)
+
+    Returns:
+        RepositorySearcher instance
+    """
+    return RepositorySearcher(
+        embedding_model=embedding_model,
+        aws_region=aws_region,
+        aws_profile=aws_profile,
+        index_dir=index_dir,
+    )
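For orientation, the sketch below shows how this module is typically driven. It is not part of the package: the repository name `my-repo`, the region, and the query are placeholders, and it assumes AWS credentials with Amazon Bedrock access plus an index previously built by the package's indexer module.

```python
# Minimal usage sketch (not shipped with the package). Assumes AWS credentials
# with Amazon Bedrock access and an index already built for the hypothetical
# repository name 'my-repo'.
from awslabs.git_repo_research_mcp_server.search import get_repository_searcher

searcher = get_repository_searcher(
    aws_region='us-west-2',  # placeholder; the default region/profile are used when omitted
)

# Directory tree of the files stored alongside the index, if the index exists.
tree = searcher.list_repository_files('my-repo')
if tree:
    print(tree)

# Semantic search against the FAISS index; index_path accepts either a
# repository name or a path to the index directory.
response = searcher.search(index_path='my-repo', query='how is authentication handled?', limit=5)
for result in response.results:
    print(f'{result.score:.2f}  {result.file_path}')
```

Note that `search()` only falls back to `similarity_search_with_score` when the plain `similarity_search` call fails, so in the common path every result carries the default score of 1.0.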