awslabs.git-repo-research-mcp-server 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- awslabs/__init__.py +12 -0
- awslabs/git_repo_research_mcp_server/__init__.py +13 -0
- awslabs/git_repo_research_mcp_server/defaults.py +347 -0
- awslabs/git_repo_research_mcp_server/embeddings.py +66 -0
- awslabs/git_repo_research_mcp_server/github_search.py +471 -0
- awslabs/git_repo_research_mcp_server/indexer.py +860 -0
- awslabs/git_repo_research_mcp_server/models.py +291 -0
- awslabs/git_repo_research_mcp_server/repository.py +321 -0
- awslabs/git_repo_research_mcp_server/search.py +350 -0
- awslabs/git_repo_research_mcp_server/server.py +914 -0
- awslabs/git_repo_research_mcp_server/utils.py +396 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/METADATA +190 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/RECORD +17 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/WHEEL +4 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/entry_points.txt +2 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/licenses/LICENSE +175 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/licenses/NOTICE +2 -0
|
@@ -0,0 +1,914 @@
|
|
|
1
|
+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
|
|
4
|
+
# with the License. A copy of the License is located at
|
|
5
|
+
#
|
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
+
#
|
|
8
|
+
# or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
|
|
9
|
+
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
|
|
10
|
+
# and limitations under the License.
|
|
11
|
+
"""awslabs git-repo-research MCP Server implementation."""
|
|
12
|
+
|
|
13
|
+
import argparse
|
|
14
|
+
import json
|
|
15
|
+
import mimetypes
|
|
16
|
+
import os
|
|
17
|
+
import sys
|
|
18
|
+
from awslabs.git_repo_research_mcp_server.defaults import Constants
|
|
19
|
+
from awslabs.git_repo_research_mcp_server.github_search import (
|
|
20
|
+
github_repo_search_wrapper,
|
|
21
|
+
)
|
|
22
|
+
from awslabs.git_repo_research_mcp_server.indexer import (
|
|
23
|
+
IndexConfig,
|
|
24
|
+
RepositoryConfig,
|
|
25
|
+
get_repository_indexer,
|
|
26
|
+
)
|
|
27
|
+
from awslabs.git_repo_research_mcp_server.models import (
|
|
28
|
+
DeleteRepositoryResponse,
|
|
29
|
+
EmbeddingModel,
|
|
30
|
+
GitHubRepoSearchResponse,
|
|
31
|
+
GitHubRepoSearchResult,
|
|
32
|
+
)
|
|
33
|
+
from awslabs.git_repo_research_mcp_server.search import get_repository_searcher
|
|
34
|
+
from awslabs.git_repo_research_mcp_server.utils import (
|
|
35
|
+
DateTimeEncoder,
|
|
36
|
+
delete_indexed_repository,
|
|
37
|
+
list_indexed_repositories,
|
|
38
|
+
)
|
|
39
|
+
from datetime import datetime
|
|
40
|
+
from loguru import logger
|
|
41
|
+
from mcp.server.fastmcp import Context, FastMCP, Image
|
|
42
|
+
from mcp.types import ImageContent
|
|
43
|
+
from pydantic import Field
|
|
44
|
+
from typing import Dict, List, Optional, Union
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# Configure logging.
# Drop whatever handlers are currently attached to the loguru logger, then
# attach a single sink on stderr. The minimum level is read from the
# FASTMCP_LOG_LEVEL environment variable and falls back to 'INFO' when unset.
logger.remove()
logger.add(sys.stderr, level=os.getenv('FASTMCP_LOG_LEVEL', 'INFO'))
|
|
50
|
+
|
|
51
|
+
# Create the MCP server
|
|
52
|
+
mcp = FastMCP(
|
|
53
|
+
'Git Repository Research MCP Server',
|
|
54
|
+
instructions="""
|
|
55
|
+
# Git Repository Research MCP Server
|
|
56
|
+
|
|
57
|
+
This MCP server provides tools and resources for indexing and searching Git repositories using semantic search.
|
|
58
|
+
|
|
59
|
+
## Important Note on Repository Names
|
|
60
|
+
|
|
61
|
+
When working with repository names that include organization (e.g., "awslabs/mcp"), you MUST use underscores instead of slashes in URIs (e.g., "awslabs_mcp") for compatibility. This affects:
|
|
62
|
+
- How repositories are stored in the index directory
|
|
63
|
+
- How repositories are referenced in metadata.json
|
|
64
|
+
- How repositories should be referenced in URIs and search queries
|
|
65
|
+
|
|
66
|
+
IMPORTANT: Always use underscores in URIs (e.g., `repositories://awslabs_mcp/summary`), NOT slashes.
|
|
67
|
+
|
|
68
|
+
## Available Tools
|
|
69
|
+
|
|
70
|
+
### create_research_repository
|
|
71
|
+
Build a FAISS index for a Git repository.
|
|
72
|
+
|
|
73
|
+
### search_research_repository
|
|
74
|
+
Perform semantic search within an indexed repository.
|
|
75
|
+
|
|
76
|
+
### delete_research_repository
|
|
77
|
+
Delete an indexed repository.
|
|
78
|
+
|
|
79
|
+
### search_research_repository_suggestions
|
|
80
|
+
Search for GitHub repositories based on keywords, scoped to specific organizations.
|
|
81
|
+
|
|
82
|
+
### access_file
|
|
83
|
+
Access file or directory contents. This tool is recommended for accessing files with complex paths, especially those containing slashes in repository names (e.g., "awslabs/mcp/repository/README.md").
|
|
84
|
+
|
|
85
|
+
## Available Resources
|
|
86
|
+
|
|
87
|
+
### repositories://{repository_name}/summary
|
|
88
|
+
Get a summary of an indexed repository including directory structure and helpful files (READMEs, etc.). This is particularly useful for understanding the structure of the repository and quickly finding important documentation. The repository_name can be a simple name or in the format "org_repo".
|
|
89
|
+
|
|
90
|
+
### repositories://
|
|
91
|
+
List all indexed repositories with detailed information including file counts, chunk counts, file types, etc.
|
|
92
|
+
|
|
93
|
+
### repositories://{index_directory}
|
|
94
|
+
List all indexed repositories from a specific index directory.
|
|
95
|
+
|
|
96
|
+
## Usage Examples
|
|
97
|
+
|
|
98
|
+
### Summarizing or describing purpose/objective/goals of the specific repository (e.g. 'What does this repo do?' or 'What are the main features?').
|
|
99
|
+
```
|
|
100
|
+
# Access the repository summary resource
|
|
101
|
+
repositories://awslabs_mcp/summary
|
|
102
|
+
|
|
103
|
+
# Or for a simple repository name
|
|
104
|
+
repositories://my-repo-name/summary
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Then after identifying the main files of interest (e.g. README.md, diagrams, etc.), you can further investigate using other tools.
|
|
108
|
+
|
|
109
|
+
### Indexing a Repository
|
|
110
|
+
```
|
|
111
|
+
create_research_repository(repository_path="https://github.com/username/repo.git")
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### Describing the Structure of a Repository (Directory Tree Format)
|
|
115
|
+
```
|
|
116
|
+
# Access the repository summary resource (with organization name)
|
|
117
|
+
repositories://awslabs_mcp/summary
|
|
118
|
+
|
|
119
|
+
# Or without organization name
|
|
120
|
+
repositories://my-repo-name/summary
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### Searching a Repository
|
|
124
|
+
```
|
|
125
|
+
search_research_repository(index_path="repo_name", query="How does the authentication system work?")
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### Listing Indexed Repositories
|
|
129
|
+
```
|
|
130
|
+
# Default listing
|
|
131
|
+
repositories://
|
|
132
|
+
|
|
133
|
+
# Listing from a specific directory
|
|
134
|
+
repositories:///path/to/custom/index/directory
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Accessing Files
|
|
138
|
+
```
|
|
139
|
+
# Using the tool
|
|
140
|
+
access_file(filepath="awslabs/mcp/repository/README.md")
|
|
141
|
+
access_file(filepath="/Users/username/.git_repo_research/repo_name/repository/src/file.py")
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Deleting a Repository
|
|
145
|
+
```
|
|
146
|
+
delete_research_repository(repository_name_or_path="repo_name")
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### Searching for GitHub Repositories
|
|
150
|
+
```
|
|
151
|
+
search_research_repository_suggestions(
|
|
152
|
+
keywords=["serverless", "lambda"],
|
|
153
|
+
num_results=10
|
|
154
|
+
)
|
|
155
|
+
```
|
|
156
|
+
Results are automatically filtered to AWS organizations (aws-samples, aws-solutions-library-samples, awslabs) and specific licenses (Apache License 2.0, MIT, MIT No Attribution), and sorted by stars (descending) and then by updated date.
|
|
157
|
+
""",
|
|
158
|
+
dependencies=[
|
|
159
|
+
'boto3',
|
|
160
|
+
'faiss-cpu',
|
|
161
|
+
'gitpython',
|
|
162
|
+
'loguru',
|
|
163
|
+
'numpy',
|
|
164
|
+
'pydantic',
|
|
165
|
+
],
|
|
166
|
+
)
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
@mcp.tool(name='create_research_repository')
async def mcp_index_repository(
    ctx: Context,
    repository_path: str = Field(
        description='Path to local repository or URL to remote repository'
    ),
    output_path: Optional[str] = Field(
        default=None,
        description='Where to store the index (optional, uses default if not provided)',
    ),
    embedding_model: str = Field(
        default=EmbeddingModel.AMAZON_TITAN_EMBED_TEXT_V2,
        description='Which AWS embedding model to use',
    ),
    include_patterns: Optional[List[str]] = Field(
        default=Constants.DEFAULT_INCLUDE_PATTERNS,
        description='Glob patterns for files to include (optional). Defaults to common source code and documentation files.',
    ),
    exclude_patterns: Optional[List[str]] = Field(
        default=Constants.DEFAULT_EXCLUDE_PATTERNS,
        description='Glob patterns for files to exclude (optional). Defaults to common binary files, build artifacts, and VCS directories.',
    ),
    chunk_size: int = Field(
        default=1000,
        description='Maximum size of each chunk in characters',
    ),
    chunk_overlap: int = Field(
        default=200,
        description='Overlap between chunks in characters',
    ),
) -> Dict:
    """Index a Git repository so it can be searched semantically.

    Builds the index configuration from the arguments and the AWS_REGION /
    AWS_PROFILE environment variables, runs the repository indexer, and
    returns the indexer's response as a plain dict.

    Args:
        ctx: MCP context; forwarded to the indexer for progress tracking and
            used here to report errors to the client
        repository_path: Path to local repository or URL to remote repository
        output_path: Where to store the index (optional). Any '/' in the
            value is replaced by '_' before it is handed to the indexer
        embedding_model: Which AWS embedding model to use
        include_patterns: Glob patterns for files to include (optional)
        exclude_patterns: Glob patterns for files to exclude (optional)
        chunk_size: Maximum size of each chunk in characters
        chunk_overlap: Overlap between chunks in characters

    Returns:
        The indexer response dumped to a dict. On success, and when a
        'repository' directory exists under the index path, the response
        also carries that directory as ``repository_directory``.

    Raises:
        Exception: Any failure is logged, reported through ``ctx.error`` and
            then re-raised unchanged.
    """
    logger.info(f'Indexing repository: {repository_path}')

    # Slashes are flattened to underscores so the value is usable as a single
    # directory name.
    if output_path and '/' in output_path:
        output_path = '_'.join(output_path.split('/'))
        logger.info(f'Normalized output path: {output_path}')

    try:
        # AWS settings come from the process environment, not from arguments.
        index_config = IndexConfig(
            embedding_model=embedding_model,
            aws_region=os.environ.get('AWS_REGION'),
            aws_profile=os.environ.get('AWS_PROFILE'),
        )
        repository_config = RepositoryConfig(
            repository_path=repository_path,
            output_path=output_path,
            include_patterns=include_patterns,
            exclude_patterns=exclude_patterns,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

        indexer = get_repository_indexer(config=index_config)
        result = await indexer.index_repository(config=repository_config, ctx=ctx)

        # Expose the location of the repository files next to the index, but
        # only when that directory is really there.
        if result.status == 'success':
            checkout_dir = os.path.join(result.index_path, 'repository')
            if os.path.isdir(checkout_dir):
                result.repository_directory = checkout_dir

        return result.model_dump()
    except Exception as exc:
        logger.error(f'Error indexing repository: {exc}')
        await ctx.error(f'Error indexing repository: {str(exc)}')
        raise
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _readme_paths_from_tree_dict(tree_dict, repository_name, current_path=''):
    """Collect README paths from a nested-dict file tree.

    Dict values are treated as directories and descended into; any other
    entry whose name starts with 'readme' (case-insensitive) is reported.

    Args:
        tree_dict: Mapping of entry name to either a nested dict or anything else
        repository_name: Prefix placed in front of every reported path
        current_path: Slash-joined path of the directory being walked

    Returns:
        List of '<repository_name>/<path>' strings
    """
    readme_paths = []
    for name, content in tree_dict.items():
        path = f'{current_path}/{name}' if current_path else name
        if isinstance(content, dict):
            readme_paths.extend(_readme_paths_from_tree_dict(content, repository_name, path))
        elif name.lower().startswith('readme'):
            readme_paths.append(f'{repository_name}/{path}')
    return readme_paths


def _readme_paths_from_tree_text(tree_text, repository_name):
    """Collect README paths from a textual, box-drawing directory tree.

    The parser assumes four columns of ' ' / '│' per nesting level and that
    directory entries end with '/'. Lines without a '├──' / '└──' style
    connector are ignored.

    Args:
        tree_text: The rendered tree, one entry per line
        repository_name: Prefix placed in front of every reported path

    Returns:
        List of '<repository_name>/<path>' strings
    """
    import re

    entry_pattern = re.compile(r'[─└├]─+\s+(.+)$')
    readme_pattern = re.compile(r'README', re.IGNORECASE)

    ancestors = []  # directory names from the root down to the current entry's parent
    readme_paths = []

    for line in tree_text.split('\n'):
        if not line.strip():
            continue

        match = entry_pattern.search(line)
        if not match:
            continue

        # Nesting depth = number of leading indentation columns / 4.
        depth = (len(line) - len(line.lstrip(' │'))) // 4

        # Keep only the real ancestors of this entry. Keeping one extra
        # element here would leave a previously listed sibling directory on
        # the stack and prepend it to files that are not inside it.
        del ancestors[depth:]

        name = match.group(1)
        if name.endswith('/'):
            ancestors.append(name.rstrip('/'))
        elif readme_pattern.match(name):
            path = '/'.join(ancestors + [name])
            readme_paths.append(f'{repository_name}/{path}')

    return readme_paths


@mcp.resource(
    uri='repositories://{repository_name}/summary',
    name='Repository Summary',
    mime_type='application/json',
)
async def repository_summary(repository_name: str) -> str:
    """Get a summary of an indexed repository including structure and helpful files.

    This resource provides a summary of the repository including:
    - Directory tree structure of all files
    - List of helpful files (READMEs, documentation, etc.)

    Args:
        repository_name: Name of the repository; '/' is replaced by '_' for
            the index lookup

    Returns:
        JSON string. On success it holds 'status', 'tree', 'repository_name',
        'repository_directory' (None when the directory does not exist) and
        'helpful_files'. When the repository has no file tree, or on any
        exception, it holds 'status': 'error' and a 'message'.
    """
    logger.info(f'Listing files for repository: {repository_name}')

    # Convert repository name with slashes to underscores for file path compatibility
    normalized_repo_name = repository_name.replace('/', '_')
    logger.info(f'Normalized repository name: {normalized_repo_name}')

    try:
        # AWS settings come from the process environment.
        searcher = get_repository_searcher(
            aws_region=os.environ.get('AWS_REGION'),
            aws_profile=os.environ.get('AWS_PROFILE'),
        )

        tree = searcher.list_repository_files(
            repository_name=normalized_repo_name,
        )

        if tree is None:
            return json.dumps(
                {
                    'status': 'error',
                    'message': f'Repository not found or no files available: {repository_name}',
                }
            )

        # Location of the repository files that sit next to the index.
        index_path = searcher.repository_indexer._get_index_path(normalized_repo_name)
        repo_files_path = os.path.join(index_path, 'repository')

        # Find helpful files (READMEs, etc.); the tree may be a nested dict
        # or an already rendered string.
        helpful_files = []
        if tree and isinstance(tree, dict):
            helpful_files = _readme_paths_from_tree_dict(tree, repository_name)
        elif tree and isinstance(tree, str):
            logger.info('Tree is a string, attempting to parse directory structure')
            helpful_files = _readme_paths_from_tree_text(tree, repository_name)
            if helpful_files:
                logger.info(
                    f'Found {len(helpful_files)} README files with full paths in string tree'
                )
            else:
                logger.warning('No README files found in string tree')

        return json.dumps(
            {
                'status': 'success',
                'tree': tree,
                'repository_name': repository_name,
                'repository_directory': (
                    repo_files_path if os.path.isdir(repo_files_path) else None
                ),
                'helpful_files': helpful_files,
            },
            cls=DateTimeEncoder,
        )
    except Exception as e:
        logger.error(f'Error listing repository files: {e}')
        return json.dumps(
            {'status': 'error', 'message': f'Error listing repository files: {str(e)}'},
            cls=DateTimeEncoder,
        )
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
@mcp.resource(uri='repositories://', name='Indexed Repositories', mime_type='application/json')
async def list_repositories() -> str:
    """Return every indexed repository, with details, as a JSON string.

    The listing is requested from the default index directory in detailed
    mode. Each entry whose index path contains a 'repository' directory gets
    that directory recorded as ``repository_directory``.

    Returns:
        JSON string with the dumped listing, or a JSON object holding
        'status': 'error' and a 'message' when anything fails
    """
    logger.info('Listing indexed repositories')

    try:
        listing = list_indexed_repositories(index_dir=None, detailed=True)

        # Point each entry at its repository files when they are present.
        for entry in listing.repositories:
            checkout_dir = os.path.join(entry.index_path, 'repository')
            if os.path.isdir(checkout_dir):
                entry.repository_directory = checkout_dir

        # The custom encoder takes care of datetime values in the dump.
        return json.dumps(listing.model_dump(), cls=DateTimeEncoder)
    except Exception as exc:
        logger.error(f'Error listing indexed repositories: {exc}')
        failure = {
            'status': 'error',
            'message': f'Error listing indexed repositories: {str(exc)}',
        }
        return json.dumps(failure)
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
async def access_file_or_directory(filepath: str) -> Union[str, List[str], Image]:
    """Access file or directory contents.

    - If the filepath references a text file, returns the content as a string
    - If the filepath references a directory, returns a JSON string listing
      the entries of the directory
    - If the filepath references an image (by guessed MIME type), returns the
      image data

    For repository files, use the format: repository_name/repository/path/to/file
    Example: awslabs_mcp/repository/README.md

    For repositories with organization names, both formats are supported:
    - awslabs_mcp/repository/README.md (with underscore)
    - awslabs/mcp/repository/README.md (with slash)

    The slash form is only interpreted as "org/repo" when no file or
    directory exists at the literal path, so real filesystem paths keep
    working as given.

    Args:
        filepath: Path to the file or directory to access

    Returns:
        File content, an ``Image``, or a JSON string. Errors (path not found,
        unreadable image, binary non-image file, any other exception) are
        reported as a JSON string with 'status': 'error' and a 'message'.
    """
    logger.info(f'Accessing file or directory: {filepath}')

    try:
        parts = filepath.split('/')

        # "org/repo/repository/..." -> "org_repo/repository/...".
        # After splitting on '/', no single part can contain a slash, so the
        # organization form has to be recognized by position instead.
        if (
            len(parts) >= 3
            and parts[0]
            and parts[1]
            and parts[1] != 'repository'
            and parts[2] == 'repository'
            and not os.path.exists(filepath)
        ):
            parts = [f'{parts[0]}_{parts[1]}', *parts[2:]]
            filepath = '/'.join(parts)
            logger.info(f'Normalized filepath: {filepath}')

        # Repository-relative path (format: repo_name/repository/...)
        if len(parts) >= 2 and parts[1] == 'repository':
            repo_name = parts[0]
            try:
                # AWS settings come from the process environment.
                searcher = get_repository_searcher(
                    aws_region=os.environ.get('AWS_REGION'),
                    aws_profile=os.environ.get('AWS_PROFILE'),
                )

                # Translate the repository name into the on-disk location of
                # its files, then append whatever follows 'repository'.
                index_path = searcher.repository_indexer._get_index_path(repo_name)
                repo_path = os.path.join(index_path, 'repository')
                if len(parts) > 2:
                    file_path = os.path.join(repo_path, *parts[2:])
                else:
                    file_path = repo_path

                logger.info(f'Accessing repository file: {file_path}')
                filepath = file_path
            except Exception as e:
                logger.error(f'Error resolving repository path: {e}')
                return json.dumps(
                    {
                        'status': 'error',
                        'message': f'Error resolving repository path: {str(e)}',
                    }
                )

        # Check if the path exists
        if not os.path.exists(filepath):
            return json.dumps(
                {
                    'status': 'error',
                    'message': f'File or directory not found: {filepath}',
                }
            )

        # If it's a directory, return a listing of files
        if os.path.isdir(filepath):
            files = os.listdir(filepath)
            return json.dumps(
                {
                    'status': 'success',
                    'type': 'directory',
                    'path': filepath,
                    'files': files,
                }
            )

        # If it's a file, decide how to read it from the guessed MIME type
        mime_type, _ = mimetypes.guess_type(filepath)

        # If it's an image, return the image data
        if mime_type and mime_type.startswith('image/'):
            try:
                with open(filepath, 'rb') as f:
                    image_data = f.read()

                # Extract format from mime_type (e.g., "image/png" -> "png")
                image_format = mime_type.split('/')[1]

                return Image(data=image_data, format=image_format)
            except Exception as e:
                logger.error(f'Error processing image file: {e}')
                return json.dumps(
                    {
                        'status': 'error',
                        'message': f'Error processing image file: {str(e)}',
                    }
                )

        # Everything else is read as UTF-8 text
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
            return content
        except UnicodeDecodeError:
            # If we can't decode as text, it's likely a binary file
            return json.dumps(
                {
                    'status': 'error',
                    'message': f'File appears to be binary and not an image: {filepath}',
                }
            )

    except Exception as e:
        logger.error(f'Error accessing file or directory: {e}')
        return json.dumps(
            {
                'status': 'error',
                'message': f'Error accessing file or directory: {str(e)}',
            }
        )
|
|
596
|
+
|
|
597
|
+
|
|
598
|
+
@mcp.tool(name='search_research_repository')
async def mcp_search_repository(
    ctx: Context,
    index_path: str = Field(description='Name of the repository or path to the index to search'),
    query: str = Field(description='The search query to use for semantic search'),
    limit: int = Field(default=10, description='Maximum number of results to return'),
    threshold: float = Field(
        default=0.0, description='Minimum similarity score threshold (0.0 to 1.0)'
    ),
) -> Dict:
    """Run a semantic search against an indexed repository.

    The search is delegated to the repository searcher, which is configured
    from the AWS_REGION / AWS_PROFILE environment variables. The wall-clock
    duration of the call is added to the result.

    Args:
        ctx: MCP context object used for error reporting
        index_path: Name of the repository or path to the index to search;
            every '/' is replaced by '_' before the lookup
        query: The search query to use for semantic search
        limit: Maximum number of results to return
        threshold: Minimum similarity score threshold (0.0 to 1.0)

    Returns:
        The searcher response dumped to a dict, plus 'execution_time_ms'

    Raises:
        Exception: Any failure is logged, reported through ``ctx.error`` and
            then re-raised unchanged.
    """
    logger.info(f'Searching repository: {index_path} for query: {query}')

    # Slashes are flattened to underscores for file path compatibility.
    normalized_index_path = '_'.join(str(index_path).split('/'))
    if normalized_index_path != index_path:
        logger.info(f'Normalized index path: {normalized_index_path}')

    try:
        started_at = datetime.now()

        searcher = get_repository_searcher(
            aws_region=os.environ.get('AWS_REGION'),
            aws_profile=os.environ.get('AWS_PROFILE'),
        )
        result = searcher.search(
            index_path=normalized_index_path,
            query=query,
            limit=limit,
            threshold=threshold,
        )

        # Elapsed time covers searcher construction and the search itself.
        elapsed = datetime.now() - started_at
        execution_time_ms = elapsed.total_seconds() * 1000

        payload = result.model_dump()
        payload['execution_time_ms'] = execution_time_ms
        return payload
    except Exception as exc:
        logger.error(f'Error searching repository: {exc}')
        await ctx.error(f'Error searching repository: {str(exc)}')
        raise
|
|
665
|
+
|
|
666
|
+
|
|
667
|
+
@mcp.tool(name='search_research_repository_suggestions')
async def mcp_search_github_repos(
    ctx: Context,
    keywords: List[str] = Field(description='List of keywords to search for GitHub repositories'),
    num_results: int = Field(default=5, description='Number of results to return'),
) -> Dict:
    """Search for GitHub repositories based on keywords, scoped to specific organizations.

    This tool searches for GitHub repositories using the GitHub REST/GraphQL APIs, scoped to specific GitHub
    organizations (aws-samples, aws-solutions-library-samples, and awslabs).

    Results are filtered to only include repositories with specific licenses (Apache License 2.0,
    MIT, and MIT No Attribution) and are sorted by stars (descending) and then by updated date.

    For higher rate limits, you can set the GITHUB_TOKEN environment variable with a GitHub
    personal access token. Without a token, the API is limited to 60 requests per hour, and requests are
    made with the REST API. With a token, this increases to 5,000 requests per hour, and requests are made
    with the GraphQL API.

    Args:
        ctx: MCP context object used for error reporting
        keywords: List of keywords to search for
        num_results: Number of results to return

    Returns:
        List of GitHub repositories matching the search criteria
    """
    # NOTE(review): the docstring above is presumably surfaced to MCP clients as the
    # tool description, so its wording is intentionally left untouched.
    logger.info(f'Searching for GitHub repositories with keywords: {keywords}')

    try:
        started_at = datetime.now()

        # The token is only inspected here to report which mode is in effect;
        # it is not passed to the search wrapper from this function.
        auth_mode_message = (
            'Using authenticated GitHub API (higher rate limits)'
            if os.environ.get('GITHUB_TOKEN')
            else 'Using unauthenticated GitHub API (lower rate limits)'
        )
        logger.info(auth_mode_message)

        # Search scope and license allow-list are fixed, not caller-configurable.
        organizations = ['aws-samples', 'aws-solutions-library-samples', 'awslabs']
        license_filter = ['Apache License 2.0', 'MIT', 'MIT No Attribution']

        raw_results = github_repo_search_wrapper(
            keywords=keywords,
            organizations=organizations,
            num_results=num_results,
            license_filter=license_filter,
        )

        # Timing covers only the search call, not the model conversion below.
        execution_time_ms = (datetime.now() - started_at).total_seconds() * 1000

        # url/title/organization are mandatory keys; everything else is optional
        # and falls back to None when the search result does not carry it.
        optional_fields = (
            'description',
            'stars',
            'updated_at',
            'language',
            'topics',
            'license',
            'forks',
            'open_issues',
            'homepage',
        )
        repo_results = [
            GitHubRepoSearchResult(
                url=item['url'],
                title=item['title'],
                organization=item['organization'],
                **{field_name: item.get(field_name) for field_name in optional_fields},
            )
            for item in raw_results
        ]

        # A list of keywords is flattened into a single space-separated query string.
        if isinstance(keywords, list):
            query_text = ' '.join(keywords)
        else:
            query_text = keywords

        return GitHubRepoSearchResponse(
            status='success',
            query=query_text,
            organizations=organizations,
            results=repo_results,
            total_results=len(repo_results),
            execution_time_ms=execution_time_ms,
        ).model_dump()
    except Exception as e:
        # Log locally, report to the MCP client, then let the original error propagate.
        logger.error(f'Error searching for GitHub repositories: {e}')
        await ctx.error(f'Error searching for GitHub repositories: {str(e)}')
        raise
|
|
761
|
+
|
|
762
|
+
|
|
763
|
+
@mcp.tool(name='access_file')
async def mcp_access_file(
    ctx: Context,
    filepath: str = Field(description='Path to the file or directory to access'),
) -> Dict | ImageContent:
    """Access file or directory contents.

    This tool provides access to file or directory contents:
    - If the filepath references a text file, returns the content as a string
    - If the filepath references a directory, returns an array of files in the directory
    - If the filepath references a binary image (jpg, png), returns the image data

    For repository files, use the format: repository_name/repository/path/to/file
    Example: awslabs_mcp/repository/README.md

    For repositories with organization names, both formats are supported:
    - awslabs_mcp/repository/README.md (with underscore)
    - awslabs/mcp/repository/README.md (with slash)

    Args:
        ctx: MCP context object used for error reporting
        filepath: Path to the file or directory to access

    Returns:
        File content, directory listing, or image data
    """
    # NOTE(review): the docstring above is presumably surfaced to MCP clients as the
    # tool description, so its wording is intentionally left untouched.
    #
    # Any failure is logged, reported through ctx.error, and re-raised as a plain
    # Exception whose message starts with 'Error accessing file: '.
    logger.info(f'Tool: Accessing file or directory: {filepath}')

    try:
        result = await access_file_or_directory(filepath)

        if isinstance(result, str):
            if result.startswith('{'):
                # Strings starting with '{' are treated as a JSON payload (error or
                # directory listing). File content can also start with '{' without
                # being JSON (templates, brace-first source files); in that case fall
                # through and return it as ordinary text instead of failing the call.
                # NOTE(review): a text file holding a valid JSON object is still
                # returned parsed, exactly as before; telling the two apart needs a
                # richer return contract from access_file_or_directory.
                try:
                    return json.loads(result)
                except json.JSONDecodeError:
                    logger.info(
                        f'Content of {filepath} starts with "{{" but is not valid JSON; '
                        'returning it as text'
                    )
            return {'status': 'success', 'type': 'text', 'content': result}

        if isinstance(result, Image):
            # Images are converted to the MCP image content type.
            return result.to_image_content()

        # Anything else is unexpected; report it as a structured error.
        return {
            'status': 'error',
            'message': f'Unknown result type: {type(result)}',
        }
    except Exception as e:
        logger.error(f'Error in mcp_access_file: {e}')
        await ctx.error(f'Error accessing file or directory: {str(e)}')
        # Message format is kept stable for callers; chain the cause explicitly so
        # the original traceback is preserved as __cause__.
        raise Exception(f'Error accessing file: {str(e)}') from e
|
|
817
|
+
|
|
818
|
+
|
|
819
|
+
@mcp.tool(name='delete_research_repository')
async def mcp_delete_repository(
    ctx: Context,
    repository_name_or_path: str = Field(
        description='Name of the repository or path to the index to delete'
    ),
    index_directory: Optional[str] = Field(
        default=None,
        description='Directory to look for indices (optional, uses default if not provided)',
    ),
) -> Dict:
    """Delete an indexed repository.

    This tool deletes an indexed repository and its associated files.
    It can be identified by repository name or the full path to the index.

    Args:
        ctx: MCP context object used for error reporting
        repository_name_or_path: Name of the repository or path to the index to delete
        index_directory: Directory to look for indices (optional, uses default if not provided)

    Returns:
        Status of the delete operation
    """
    # NOTE(review): the docstring above is presumably surfaced to MCP clients as the
    # tool description, so its wording is intentionally left untouched.
    logger.info(f'Deleting repository: {repository_name_or_path}')

    # Repository names such as "org/repo" are stored with underscores, so slashes
    # are rewritten for them. An absolute path is the "full path to the index" case
    # promised by the docstring; rewriting its separators would corrupt the path,
    # so it is passed through unchanged.
    # NOTE(review): relative index paths cannot be told apart from "org/repo"
    # names here and are still normalized; confirm how delete_indexed_repository
    # resolves its argument.
    requested_target = str(repository_name_or_path)
    if os.path.isabs(requested_target):
        normalized_repo_name = requested_target
    else:
        normalized_repo_name = requested_target.replace('/', '_')
    logger.info(f'Normalized repository name: {normalized_repo_name}')

    await ctx.info(f'Deleting repository: {normalized_repo_name}')

    # Ensure index_directory is None or a string, not a Field
    index_dir = None if index_directory is None else str(index_directory)

    try:
        start_time = datetime.now()

        result = await delete_indexed_repository(
            repository_name_or_path=normalized_repo_name,
            index_dir=index_dir,
        )

        execution_time_ms = (datetime.now() - start_time).total_seconds() * 1000

        # 'status' and 'message' are required keys of the helper's result;
        # 'repository_name' may be absent and then becomes None.
        response_data = {
            'status': result['status'],
            'message': result['message'],
            'repository_name': result.get('repository_name'),
            'execution_time_ms': execution_time_ms,
        }

        # Copy through optional diagnostics only when the helper reported them.
        for optional_key in ('deleted_files', 'errors', 'permission_issues'):
            if optional_key in result:
                response_data[optional_key] = result[optional_key]

        return DeleteRepositoryResponse(**response_data).model_dump()
    except Exception as e:
        # Log locally, report to the MCP client, then let the original error propagate.
        logger.error(f'Error deleting repository: {e}')
        await ctx.error(f'Error deleting repository: {str(e)}')
        raise
|
|
893
|
+
|
|
894
|
+
|
|
895
|
+
def main():
    """Parse command-line options and start the MCP server.

    With ``--sse`` the server runs over the SSE transport on ``--port``
    (default 8888); otherwise it runs with the default transport and the
    port option is not applied.
    """
    cli = argparse.ArgumentParser(
        description='An AWS Labs Model Context Protocol (MCP) server for researching git repositories'
    )
    cli.add_argument('--sse', action='store_true', help='Use SSE transport')
    cli.add_argument('--port', type=int, default=8888, help='Port to run the server on')
    options = cli.parse_args()

    if not options.sse:
        # Default transport: no port configuration is needed.
        mcp.run()
        return

    # The port must be set on the server settings before the SSE transport starts.
    mcp.settings.port = options.port
    mcp.run(transport='sse')


if __name__ == '__main__':
    main()
|