awslabs.git-repo-research-mcp-server 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,350 @@
1
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
4
+ # with the License. A copy of the License is located at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
9
+ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
10
+ # and limitations under the License.
11
+ """Search functionality for Git Repository Research MCP Server.
12
+
13
+ This module provides functionality for searching within indexed Git repositories
14
+ using LangChain's FAISS implementation.
15
+ """
16
+
17
+ import os
18
+ import time
19
+ from awslabs.git_repo_research_mcp_server.defaults import Constants
20
+ from awslabs.git_repo_research_mcp_server.embeddings import get_embedding_model
21
+ from awslabs.git_repo_research_mcp_server.indexer import (
22
+ IndexConfig,
23
+ get_docstore_dict_size,
24
+ get_repository_indexer,
25
+ )
26
+ from awslabs.git_repo_research_mcp_server.models import (
27
+ EmbeddingModel,
28
+ SearchResponse,
29
+ SearchResult,
30
+ )
31
+ from loguru import logger
32
+ from typing import Optional
33
+
34
+
35
class RepositorySearcher:
    """Searcher for indexed Git repositories using LangChain.

    The searcher wraps a repository indexer (used to locate and load a
    repository's vector store) and exposes two operations: rendering the
    directory tree of an indexed repository and running a similarity search
    against its index.
    """

    def __init__(
        self,
        embedding_model: str = EmbeddingModel.AMAZON_TITAN_EMBED_TEXT_V2,
        aws_region: Optional[str] = None,
        aws_profile: Optional[str] = None,
        index_dir: Optional[str] = None,
    ):
        """Initialize the repository searcher.

        Args:
            embedding_model: ID of the embedding model to use
            aws_region: AWS region to use (optional, uses default if not provided)
            aws_profile: AWS profile to use (optional, uses default if not provided)
            index_dir: Directory where indices are stored (optional, uses default if not provided)
        """
        self.embedding_model = embedding_model
        self.aws_region = aws_region
        self.aws_profile = aws_profile
        # Resolve the default location once so the attribute and the config
        # handed to the indexer can never disagree.
        self.index_dir = index_dir or os.path.expanduser(f'~/{Constants.DEFAULT_INDEX_DIR}')

        self.config = IndexConfig(
            embedding_model=embedding_model,
            aws_region=aws_region,
            aws_profile=aws_profile,
            index_dir=self.index_dir,
        )

        # Initialize the embedding generator
        self.embedding_generator = get_embedding_model(
            model_id=embedding_model,
            aws_region=aws_region,
            aws_profile=aws_profile,
        )

        # Initialize the repository indexer
        self.repository_indexer = get_repository_indexer(self.config)

    def list_repository_files(self, repository_name: str) -> Optional[str]:
        """Generate a directory tree structure of the repository files.

        Args:
            repository_name: Name of the repository

        Returns:
            String representation of the directory tree, or None if the
            repository directory does not exist or the tree could not be built
        """
        # The repository checkout lives in a 'repository' folder under the index path.
        index_path = self.repository_indexer._get_index_path(repository_name)
        repo_files_path = os.path.join(index_path, 'repository')

        # isdir() is already False for a missing path, so one check covers both cases.
        if not os.path.isdir(repo_files_path):
            logger.warning(f'Repository directory not found: {repo_files_path}')
            return None

        try:
            return self._generate_directory_tree(repo_files_path)
        except Exception as e:
            # Best effort: a failure while walking the tree is logged, not raised.
            logger.error(f'Error generating directory tree for {repository_name}: {e}')
            return None

    def _generate_directory_tree(self, path: str) -> str:
        """Generate a directory tree structure for a given path.

        Args:
            path: Path to the directory

        Returns:
            String representation of the directory tree, starting with a
            'Directory structure:' header and the directory's own name
        """
        base_name = os.path.basename(path)
        header = f'Directory structure:\n└── {base_name}/\n'
        return header + self._generate_tree(path, '', base_name)

    def _generate_tree(self, path: str, prefix: str, base_path: str) -> str:
        """Recursively generate a directory tree structure.

        Entries are listed in sorted order; names starting with '.' are skipped.
        Directories are always listed (with a trailing '/'), even when empty.

        Args:
            path: Path to the current directory
            prefix: Prefix for the current line
            base_path: Base path of the tree; only forwarded to recursive calls
                and otherwise unused, kept so the signature stays stable

        Returns:
            String representation of the directory tree below ``path``
        """
        entries = sorted(name for name in os.listdir(path) if not name.startswith('.'))
        last_index = len(entries) - 1

        # Collect the pieces and join once instead of growing a string with '+='.
        parts = []
        for position, entry in enumerate(entries):
            full_path = os.path.join(path, entry)

            # NOTE(review): the connector/indent literals below are reproduced exactly
            # as they appear in the reviewed text; the diff rendering may have collapsed
            # their whitespace - confirm against the published wheel.
            if position == last_index:
                line_start = f'{prefix} └── '
                child_prefix = prefix + ' '
            else:
                line_start = f'{prefix} ├── '
                child_prefix = prefix + ' │'

            if os.path.isdir(full_path):
                parts.append(f'{line_start}{entry}/\n')
                parts.append(self._generate_tree(full_path, child_prefix, base_path))
            else:
                parts.append(f'{line_start}{entry}\n')

        return ''.join(parts)

    def _empty_response(
        self,
        query: str,
        index_path: str,
        repository_name: str,
        start_time: float,
    ) -> SearchResponse:
        """Build the response returned when a search yields nothing or fails.

        Args:
            query: Search query text
            index_path: Resolved index path for the repository
            repository_name: Name of the repository
            start_time: ``time.time()`` value captured when the search started

        Returns:
            SearchResponse with no results; repository_directory is still set
            to the expected '<index_path>/repository' location
        """
        return SearchResponse(
            results=[],
            query=query,
            index_path=index_path,
            repository_name=repository_name,
            repository_directory=os.path.join(index_path, 'repository'),
            total_results=0,
            execution_time_ms=int((time.time() - start_time) * 1000),
        )

    def search(
        self,
        index_path: str,
        query: str,
        limit: int = 10,
        threshold: float = 0.0,
    ) -> SearchResponse:
        """Search within an indexed repository using LangChain's FAISS implementation.

        Args:
            index_path: Path to an existing index directory, or a repository name
            query: Search query text
            limit: Maximum number of results to return
            threshold: Similarity threshold for results (0.0-1.0). Results whose
                reported score is below this value are dropped. The primary
                search path reports a placeholder score of 1.0, so it is only
                affected by thresholds above 1.0.

        Returns:
            SearchResponse object with search results. Failures while loading
            or querying the index are logged and reported as a response with no
            results rather than raised.
        """
        start_time = time.time()
        # Initialize repository_name with a default value outside the try block
        # so the error path can always report something.
        repository_name = 'unknown'

        try:
            if os.path.isdir(index_path):
                # It's a directory path, extract the repository name. Normalize first:
                # basename() of a path with a trailing separator is an empty string.
                repository_name = os.path.basename(os.path.normpath(index_path))
            else:
                # It's a repository name
                repository_name = index_path
                index_path = self.repository_indexer._get_index_path(repository_name)

            vector_store = self.repository_indexer.load_index_without_pickle(repository_name)
            if vector_store is None:
                logger.error(f'Index or chunk map not found for repository {repository_name}')
                return self._empty_response(query, index_path, repository_name, start_time)

            logger.info(f"Searching for '{query}' in repository {repository_name}")

            # Debug: Print vector store info
            logger.info(f'Vector store type: {type(vector_store)}')
            logger.info(
                f'Vector store docstore size: {get_docstore_dict_size(vector_store.docstore)}'
            )

            try:
                # Primary path: plain similarity search, which returns no scores.
                langchain_results = vector_store.similarity_search(query, k=limit)

                results = []
                if langchain_results:
                    logger.info(f'Found {len(langchain_results)} results')
                    placeholder_score = 1.0  # no real score is available on this path
                    for doc in langchain_results:
                        if placeholder_score < threshold:
                            continue
                        results.append(
                            SearchResult(
                                file_path=doc.metadata.get('source', 'unknown'),
                                content=doc.page_content,
                                score=placeholder_score,
                                line_numbers=None,  # We don't track line numbers currently
                                metadata={'chunk_id': str(doc.metadata.get('chunk_id', -1))},
                            )
                        )
                else:
                    logger.info('No results found')
            except Exception as search_error:
                logger.error(f'Error with similarity_search: {search_error}')
                # Try with similarity_search_with_score as a fallback
                try:
                    logger.info('Trying with similarity_search_with_score as fallback')
                    scored_results = vector_store.similarity_search_with_score(query, k=limit)

                    results = []
                    for doc, score in scored_results:
                        # Map the returned score onto a 0-1 similarity; this treats the
                        # score as a non-negative distance (assumption - TODO confirm).
                        similarity = float(1.0 - min(1.0, score / 2.0))
                        if similarity < threshold:
                            continue
                        results.append(
                            SearchResult(
                                file_path=doc.metadata.get('source', 'unknown'),
                                content=doc.page_content,
                                score=similarity,
                                line_numbers=None,  # We don't track line numbers currently
                                metadata={
                                    'distance': str(float(score)),
                                    'chunk_id': str(doc.metadata.get('chunk_id', -1)),
                                },
                            )
                        )
                except Exception as fallback_error:
                    logger.error(
                        f'Error with similarity_search_with_score fallback: {fallback_error}'
                    )
                    results = []

            execution_time_ms = int((time.time() - start_time) * 1000)
            logger.info(f'Search completed in {execution_time_ms}ms, found {len(results)} results')

            return SearchResponse(
                results=results,
                query=query,
                index_path=index_path,
                repository_name=repository_name,
                # Always set repository_directory to the expected path
                repository_directory=os.path.join(index_path, 'repository'),
                total_results=len(results),
                execution_time_ms=execution_time_ms,
            )

        except Exception as e:
            logger.error(f'Error searching repository: {e}')
            # repository_name is already defined outside the try block
            return self._empty_response(query, index_path, repository_name, start_time)
326
+
327
+
328
def get_repository_searcher(
    embedding_model: str = EmbeddingModel.AMAZON_TITAN_EMBED_TEXT_V2,
    aws_region: Optional[str] = None,
    aws_profile: Optional[str] = None,
    index_dir: Optional[str] = None,
) -> RepositorySearcher:
    """Build a RepositorySearcher from the given settings.

    Args:
        embedding_model: ID of the embedding model to use
        aws_region: AWS region to use (optional, uses default if not provided)
        aws_profile: AWS profile to use (optional, uses default if not provided)
        index_dir: Directory where indices are stored (optional, uses default if not provided)

    Returns:
        RepositorySearcher instance
    """
    # Every argument is forwarded unchanged, by keyword, to the constructor.
    searcher_settings = {
        'embedding_model': embedding_model,
        'aws_region': aws_region,
        'aws_profile': aws_profile,
        'index_dir': index_dir,
    }
    return RepositorySearcher(**searcher_settings)