awslabs.git-repo-research-mcp-server 0.0.1 (py3-none-any.whl)

This diff shows the content of a publicly available package version as released to its public registry and is provided for informational purposes only. Because 0.0.1 is the first release, the files below are diffed against an empty baseline, so every line appears as an addition.
@@ -0,0 +1,291 @@
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+ # with the License. A copy of the License is located at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
+ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
+ # and limitations under the License.
+ """Data models for Git Repository Research MCP Server."""
+
+ from datetime import datetime
+ from enum import Enum
+ from pydantic import BaseModel, Field
+ from typing import Dict, List, Optional
+
+
+ class GitHubConfig(BaseModel):
+     """GitHub API configuration.
+
+     This model defines the configuration for the GitHub API, including
+     the optional token for authentication and the API URL.
+     """
+
+     token: Optional[str] = Field(None, description='GitHub API token for increased rate limits')
+     api_url: str = Field(
+         default='https://api.github.com/graphql', description='GitHub GraphQL API URL'
+     )
+
+
+ class IndexMetadata(BaseModel):
+     """Metadata for a repository index.
+
+     This model stores information about an indexed repository, including
+     its location, creation time, and statistics about the indexed content.
+     """
+
+     repository_name: str = Field(..., description='Name of the repository')
+     repository_path: str = Field(..., description='Path or URL of the repository')
+     index_path: str = Field(..., description='Path to the index file')
+     created_at: datetime = Field(
+         default_factory=datetime.now, description='When the index was created'
+     )
+     last_accessed: Optional[datetime] = Field(None, description='When the index was last accessed')
+     file_count: int = Field(0, description='Number of files indexed')
+     chunk_count: int = Field(0, description='Number of text chunks indexed')
+     embedding_model: str = Field(..., description='Model used for embeddings')
+     file_types: Dict[str, int] = Field(
+         default_factory=dict, description='Count of file types indexed'
+     )
+     total_tokens: Optional[int] = Field(None, description='Total number of tokens processed')
+     index_size_bytes: Optional[int] = Field(None, description='Size of the index in bytes')
+     last_commit_id: Optional[str] = Field(
+         None, description='ID of the last commit in the repository'
+     )
+     repository_directory: Optional[str] = Field(
+         None, description='Path to the cloned repository directory'
+     )
+
+
+ class SearchResult(BaseModel):
+     """Result from a repository search.
+
+     This model represents a single search result, including the file path,
+     relevant content, and similarity score.
+     """
+
+     file_path: str = Field(..., description='Path to the file within the repository')
+     content: str = Field(..., description='Relevant content snippet')
+     score: float = Field(..., description='Similarity score (0-1)')
+     line_numbers: Optional[List[int]] = Field(None, description='Line numbers for the content')
+     metadata: Optional[Dict[str, str]] = Field(
+         None, description='Additional metadata about the result'
+     )
+
+
+ class SearchResponse(BaseModel):
+     """Response from a repository search.
+
+     This model represents the complete response from a search operation,
+     including all matching results and query metadata.
+     """
+
+     results: List[SearchResult] = Field(default_factory=list, description='Search results')
+     query: str = Field(..., description='Original search query')
+     index_path: str = Field(..., description='Path to the index that was searched')
+     repository_name: str = Field(..., description='Name of the repository')
+     repository_directory: Optional[str] = Field(
+         None, description='Path to the cloned repository directory'
+     )
+     timestamp: datetime = Field(
+         default_factory=datetime.now, description='When the search was performed'
+     )
+     total_results: int = Field(0, description='Total number of results found')
+     execution_time_ms: Optional[float] = Field(
+         None, description='Search execution time in milliseconds'
+     )
+
+
+ class IndexedRepositoryInfo(BaseModel):
+     """Information about an indexed repository.
+
+     This model provides a summary of an indexed repository for listing purposes.
+     """
+
+     repository_name: str = Field(..., description='Name of the repository')
+     repository_path: str = Field(..., description='Path or URL of the repository')
+     index_path: str = Field(..., description='Path to the index file')
+     repository_directory: Optional[str] = Field(
+         None, description='Path to the cloned repository directory'
+     )
+     created_at: datetime = Field(..., description='When the index was created')
+     last_accessed: Optional[datetime] = Field(None, description='When the index was last accessed')
+     file_count: int = Field(0, description='Number of files indexed')
+     embedding_model: str = Field(..., description='Model used for embeddings')
+
+
+ class IndexedRepositoriesResponse(BaseModel):
+     """Response containing a list of indexed repositories.
+
+     This model represents the complete response from a list operation,
+     including all indexed repositories and summary statistics.
+     """
+
+     repositories: List[IndexedRepositoryInfo] = Field(
+         default_factory=list, description='List of indexed repositories'
+     )
+     total_count: int = Field(0, description='Total number of indexed repositories')
+     index_directory: str = Field(..., description='Directory containing the indices')
+
+
+ class DetailedIndexedRepositoryInfo(IndexedRepositoryInfo):
+     """Detailed information about an indexed repository.
+
+     This model extends the basic repository info with additional details
+     about the indexed content.
+     """
+
+     chunk_count: int = Field(0, description='Number of text chunks indexed')
+     file_types: Dict[str, int] = Field(
+         default_factory=dict, description='Count of file types indexed'
+     )
+     total_tokens: Optional[int] = Field(None, description='Total number of tokens processed')
+     index_size_bytes: Optional[int] = Field(None, description='Size of the index in bytes')
+     last_commit_id: Optional[str] = Field(
+         None, description='ID of the last commit in the repository'
+     )
+
+
+ class DetailedIndexedRepositoriesResponse(BaseModel):
+     """Response containing detailed information about indexed repositories.
+
+     This model represents the complete response from a detailed list operation,
+     including all indexed repositories with detailed information.
+     """
+
+     repositories: List[DetailedIndexedRepositoryInfo] = Field(
+         default_factory=list,
+         description='List of indexed repositories with detailed information',
+     )
+     total_count: int = Field(0, description='Total number of indexed repositories')
+     index_directory: str = Field(..., description='Directory containing the indices')
+     total_index_size_bytes: Optional[int] = Field(
+         None, description='Total size of all indices in bytes'
+     )
+
+
+ class EmbeddingModel(str, Enum):
+     """Available embedding models.
+
+     This enum defines the available embedding models that can be used
+     for generating embeddings from repository content.
+     """
+
+     AMAZON_TITAN_EMBED_TEXT_V1 = 'amazon.titan-embed-text-v1'
+     AMAZON_TITAN_EMBED_TEXT_V2 = 'amazon.titan-embed-text-v2:0'
+     COHERE_EMBED_ENGLISH_V3 = 'cohere.embed-english-v3'
+     COHERE_EMBED_MULTILINGUAL_V3 = 'cohere.embed-multilingual-v3'
+
+
+ class IndexRepositoryResponse(BaseModel):
+     """Response from indexing a repository.
+
+     This model represents the complete response from an indexing operation,
+     including metadata about the created index.
+     """
+
+     status: str = Field(..., description='Status of the indexing operation')
+     repository_name: str = Field(..., description='Name of the repository')
+     repository_path: str = Field(..., description='Path or URL of the repository')
+     index_path: str = Field(..., description='Path to the created index')
+     repository_directory: Optional[str] = Field(
+         None, description='Path to the cloned repository directory'
+     )
+     file_count: int = Field(0, description='Number of files indexed')
+     chunk_count: int = Field(0, description='Number of text chunks indexed')
+     embedding_model: str = Field(..., description='Model used for embeddings')
+     execution_time_ms: Optional[float] = Field(
+         None, description='Indexing execution time in milliseconds'
+     )
+     message: Optional[str] = Field(
+         None, description='Additional information about the indexing operation'
+     )
+
+
+ class GitHubRepoSearchInput(BaseModel):
+     """Input for GitHub repository search.
+
+     This model defines the input parameters for searching GitHub repositories
+     based on keywords and organizations.
+     """
+
+     keywords: List[str] = Field(description='List of keywords to search for GitHub repositories')
+     organizations: Optional[List[str]] = Field(
+         default=['aws-samples', 'aws-solutions-library-samples', 'awslabs'],
+         description='List of GitHub organizations to scope the search to',
+     )
+     num_results: Optional[int] = Field(default=5, description='Number of results to return')
+     license_filter: Optional[List[str]] = Field(
+         default=None,
+         description="List of licenses to filter by (e.g., 'Apache License 2.0', 'MIT No Attribution')",
+     )
+
+
+ class GitHubRepoSearchResult(BaseModel):
+     """Result from a GitHub repository search.
+
+     This model represents a single GitHub repository search result.
+     """
+
+     url: str = Field(..., description='URL of the GitHub repository')
+     title: str = Field(..., description='Title of the repository')
+     description: Optional[str] = Field(None, description='Description of the repository')
+     organization: str = Field(..., description='GitHub organization that owns the repository')
+     stars: Optional[int] = Field(None, description='Number of stars the repository has')
+     updated_at: Optional[str] = Field(None, description='When the repository was last updated')
+     language: Optional[str] = Field(
+         None, description='Primary programming language of the repository'
+     )
+     topics: Optional[List[str]] = Field(
+         None, description='Topics/tags associated with the repository'
+     )
+     license: Optional[str] = Field(None, description='License of the repository')
+     forks: Optional[int] = Field(None, description='Number of forks the repository has')
+     open_issues: Optional[int] = Field(None, description='Number of open issues in the repository')
+     homepage: Optional[str] = Field(None, description='Homepage URL of the repository')
+
+
+ class GitHubRepoSearchResponse(BaseModel):
+     """Response from a GitHub repository search.
+
+     This model represents the complete response from a GitHub repository search.
+     """
+
+     status: str = Field(..., description='Status of the search operation')
+     query: str = Field(..., description='Original search query')
+     organizations: List[str] = Field(..., description='Organizations that were searched')
+     results: List[GitHubRepoSearchResult] = Field(
+         default_factory=list, description='Search results'
+     )
+     total_results: int = Field(0, description='Total number of results found')
+     execution_time_ms: Optional[float] = Field(
+         None, description='Search execution time in milliseconds'
+     )
+
+
+ class DeleteRepositoryResponse(BaseModel):
+     """Response from deleting a repository.
+
+     This model represents the complete response from a delete operation,
+     including status and information about the deleted repository.
+     """
+
+     status: str = Field(
+         ..., description='Status of the delete operation (success, partial, or error)'
+     )
+     message: str = Field(..., description='Information about the delete operation')
+     repository_name: Optional[str] = Field(None, description='Name of the deleted repository')
+     execution_time_ms: Optional[float] = Field(
+         None, description='Delete operation execution time in milliseconds'
+     )
+     deleted_files: Optional[List[str]] = Field(
+         None, description='List of files that were successfully deleted'
+     )
+     errors: Optional[List[str]] = Field(
+         None, description='List of errors encountered during deletion'
+     )
+     permission_issues: Optional[List[str]] = Field(
+         None, description='List of files with permission issues'
+     )
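
For orientation, here is a minimal usage sketch for these response models. It is illustrative only: it assumes the models live in a `models` module under the `awslabs.git_repo_research_mcp_server` package (the module name is inferred from the docstring, not shown in this diff) and that Pydantic v2 is installed, which provides `model_dump_json()`.

# Hypothetical usage sketch; the import path and the Pydantic v2 API are assumptions.
from awslabs.git_repo_research_mcp_server.models import SearchResponse, SearchResult

result = SearchResult(
    file_path='src/app.py',                      # path relative to the repository root
    content='def handler(event, context): ...',  # snippet returned by the search
    score=0.87,                                  # similarity score in the 0-1 range
)
response = SearchResponse(
    results=[result],
    query='lambda handler',
    index_path='/tmp/indexes/example_org_example_repo',
    repository_name='example_org_example_repo',
    total_results=1,
)
print(response.model_dump_json(indent=2))  # serialize to JSON (Pydantic v2)
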
@@ -0,0 +1,321 @@
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+ # with the License. A copy of the License is located at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
+ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
+ # and limitations under the License.
+ """Repository handling for Git Repository Research MCP Server.
+
+ This module provides functionality for cloning, accessing, and processing
+ Git repositories for indexing and searching.
+ """
+
+ import fnmatch
+ import os
+ import shutil
+ import tempfile
+ from awslabs.git_repo_research_mcp_server.defaults import Constants
+ from git import Repo
+ from loguru import logger
+ from typing import Dict, List, Optional, Tuple
+ from urllib.parse import urlparse
+
+
+ def is_git_url(repo_path: str) -> bool:
+     """Check if a string is a Git URL.
+
+     Args:
+         repo_path: Path or URL to check
+
+     Returns:
+         True if the string is a Git URL, False otherwise
+     """
+     parsed = urlparse(repo_path)
+     return parsed.scheme in ('http', 'https', 'git', 'ssh')
+
+
+ def is_git_repo(path: str) -> bool:
+     """Check if a path is a Git repository.
+
+     Args:
+         path: Path to check
+
+     Returns:
+         True if the path is a Git repository, False otherwise
+     """
+     try:
+         Repo(path)
+         return True
+     except Exception:
+         return False
+
+
+ def clone_repository(url: str, target_dir: Optional[str] = None) -> str:
+     """Clone a Git repository from a URL.
+
+     Args:
+         url: URL of the repository to clone
+         target_dir: Directory to clone into (optional, uses temp dir if not provided)
+
+     Returns:
+         Path to the cloned repository
+
+     Raises:
+         Exception: If cloning fails
+     """
+     if target_dir is None:
+         target_dir = tempfile.mkdtemp(prefix='git_repo_research_')
+
+     logger.info(f'Cloning repository from {url} to {target_dir}')
+     try:
+         # Clone the repository with GitPython
+         Repo.clone_from(url, target_dir)
+
+         # Check if .git directory exists after cloning
+         git_dir = os.path.join(target_dir, '.git')
+         if os.path.exists(git_dir):
+             logger.info(f'.git directory exists at {git_dir}')
+         else:
+             logger.warning(f'.git directory not found after cloning at {git_dir}')
+             # List the contents of the directory to debug
+             logger.info(f'Contents of {target_dir}: {os.listdir(target_dir)}')
+
+         return target_dir
+     except Exception as e:
+         # Clean up the target directory if it was created
+         if os.path.exists(target_dir):
+             shutil.rmtree(target_dir, ignore_errors=True)
+         logger.error(f'Failed to clone repository: {e}')
+         raise
+
+
+ def get_repository_name(repo_path: str) -> str:
+     """Get the name of a repository.
+
+     Args:
+         repo_path: Path or URL of the repository
+
+     Returns:
+         Name of the repository, including GitHub organization/username if available
+         Note: For GitHub repositories, the format is "org_repo" (with underscore)
+         instead of "org/repo" for file path compatibility
+     """
+     if is_git_url(repo_path):
+         # Extract the repository name from the URL
+         parsed = urlparse(repo_path)
+         path_parts = parsed.path.strip('/').split('/')
+
+         # Check if this is a GitHub URL with org/username
+         if parsed.netloc in ['github.com', 'www.github.com'] and len(path_parts) >= 2:
+             # Include the organization/username in the repository name
+             org_name = path_parts[-2]
+             repo_name = path_parts[-1]
+             if repo_name.endswith('.git'):
+                 repo_name = repo_name[:-4]
+             # Use underscore instead of slash for file path compatibility
+             return f'{org_name}_{repo_name}'
+         else:
+             # For non-GitHub URLs or URLs without clear org structure,
+             # just use the last part of the path
+             repo_name = path_parts[-1]
+             if repo_name.endswith('.git'):
+                 repo_name = repo_name[:-4]
+             return repo_name
+     else:
+         # Use the directory name as the repository name
+         return os.path.basename(os.path.abspath(repo_path))
+
+
+ def get_text_files(
+     repo_path: str,
+     include_patterns: Optional[List[str]] = None,
+     exclude_patterns: Optional[List[str]] = None,
+ ) -> List[str]:
+     """Get all text files in a repository.
+
+     Args:
+         repo_path: Path to the repository
+         include_patterns: Glob patterns for files to include (optional)
+         exclude_patterns: Glob patterns for files to exclude (optional)
+
+     Returns:
+         List of paths to text files
+     """
+     if include_patterns is None:
+         include_patterns = Constants.TEXT_FILE_INCLUDE_PATTERNS
+     if exclude_patterns is None:
+         exclude_patterns = Constants.TEXT_FILE_EXCLUDE_PATTERNS
+
+     text_files = []
+     for root, _, files in os.walk(repo_path):
+         for file in files:
+             file_path = os.path.join(root, file)
+             rel_path = os.path.relpath(file_path, repo_path)
+
+             # Check if the file matches any include pattern
+             included = any(fnmatch.fnmatch(rel_path, pattern) for pattern in include_patterns)
+             if not included:
+                 continue
+
+             # Check if the file matches any exclude pattern
+             excluded = any(fnmatch.fnmatch(rel_path, pattern) for pattern in exclude_patterns)
+             if excluded:
+                 continue
+
+             # Try to read the file as text
+             try:
+                 with open(file_path, 'r', encoding='utf-8') as f:
+                     # Read a small sample to check if it's text
+                     sample = f.read(1024)
+                     # If we can decode it as UTF-8, it's probably text
+                     if sample:
+                         text_files.append(file_path)
+             except UnicodeDecodeError:
+                 # Not a text file
+                 pass
+             except Exception as e:
+                 logger.warning(f'Error reading file {file_path}: {e}')
+
+     return text_files
+
+
+ def get_file_extension_stats(file_paths: List[str]) -> Dict[str, int]:
+     """Get statistics about file extensions.
+
+     Args:
+         file_paths: List of file paths
+
+     Returns:
+         Dictionary mapping file extensions to counts
+     """
+     extension_counts = {}
+     for file_path in file_paths:
+         _, ext = os.path.splitext(file_path)
+         if ext:
+             # Remove the dot from the extension
+             ext = ext[1:].lower()
+             extension_counts[ext] = extension_counts.get(ext, 0) + 1
+         else:
+             extension_counts['no_extension'] = extension_counts.get('no_extension', 0) + 1
+     return extension_counts
+
+
+ def read_file_content(file_path: str) -> str:
+     """Read the content of a file.
+
+     Args:
+         file_path: Path to the file
+
+     Returns:
+         Content of the file as a string
+
+     Raises:
+         Exception: If reading fails
+     """
+     try:
+         with open(file_path, 'r', encoding='utf-8') as f:
+             return f.read()
+     except Exception as e:
+         logger.error(f'Failed to read file {file_path}: {e}')
+         raise
+
+
+ def chunk_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[str]:
+     """Split text into chunks.
+
+     Args:
+         text: Text to split
+         chunk_size: Maximum size of each chunk in characters
+         chunk_overlap: Overlap between chunks in characters
+
+     Returns:
+         List of text chunks
+     """
+     if not text or len(text) <= chunk_size:
+         return [text] if text else []
+
+     chunks = []
+     start = 0
+     while start < len(text):
+         end = start + chunk_size
+         if end >= len(text):
+             chunks.append(text[start:])
+             break
+
+         # Try to find a good breaking point (newline or space)
+         break_point = text.rfind('\n', start + chunk_size - chunk_overlap, end)
+         if break_point == -1:
+             break_point = text.rfind(' ', start + chunk_size - chunk_overlap, end)
+         if break_point == -1:
+             break_point = end
+
+         chunks.append(text[start:break_point])
+         start = break_point + 1 if text[break_point] in ['\n', ' '] else break_point
+
+     return chunks
+
+
+ def process_repository(
+     repo_path: str,
+     include_patterns: Optional[List[str]] = None,
+     exclude_patterns: Optional[List[str]] = None,
+     chunk_size: int = 1000,
+     chunk_overlap: int = 200,
+ ) -> Tuple[List[str], Dict[str, str], Dict[str, int]]:
+     """Process a repository for indexing.
+
+     Args:
+         repo_path: Path to the repository
+         include_patterns: Glob patterns for files to include (optional)
+         exclude_patterns: Glob patterns for files to exclude (optional)
+         chunk_size: Maximum size of each chunk in characters
+         chunk_overlap: Overlap between chunks in characters
+
+     Returns:
+         Tuple containing:
+         - List of text chunks
+         - Dictionary mapping chunks to file paths
+         - Dictionary of file extension statistics
+     """
+     logger.info(f'Processing repository at {repo_path}')
+     text_files = get_text_files(repo_path, include_patterns, exclude_patterns)
+     logger.info(f'Found {len(text_files)} text files')
+
+     extension_stats = get_file_extension_stats(text_files)
+     logger.info(f'File extension statistics: {extension_stats}')
+
+     chunks = []
+     chunk_to_file = {}
+
+     for file_path in text_files:
+         try:
+             content = read_file_content(file_path)
+             file_chunks = chunk_text(content, chunk_size, chunk_overlap)
+
+             rel_path = os.path.relpath(file_path, repo_path)
+             for chunk in file_chunks:
+                 chunks.append(chunk)
+                 chunk_to_file[chunk] = rel_path
+         except Exception as e:
+             logger.warning(f'Error processing file {file_path}: {e}')
+
+     logger.info(f'Created {len(chunks)} text chunks')
+     return chunks, chunk_to_file, extension_stats
+
+
+ def cleanup_repository(repo_path: str) -> None:
+     """Clean up a cloned repository.
+
+     Args:
+         repo_path: Path to the repository
+     """
+     if os.path.exists(repo_path) and os.path.isdir(repo_path):
+         logger.info(f'Cleaning up repository at {repo_path}')
+         try:
+             shutil.rmtree(repo_path, ignore_errors=True)
+         except Exception as e:
+             logger.warning(f'Error cleaning up repository: {e}')
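
To show how the helpers above compose, here is a short end-to-end sketch: clone (or reuse a local checkout), chunk the text files, and clean up. It is illustrative only: the module path `awslabs.git_repo_research_mcp_server.repository` is inferred from the module docstring, the URL and glob patterns are placeholders, and explicit include/exclude patterns are passed so the sketch does not depend on the package's `Constants` defaults.

# Hypothetical end-to-end sketch; module path, URL, and patterns are assumptions.
from awslabs.git_repo_research_mcp_server.repository import (
    cleanup_repository,
    clone_repository,
    get_repository_name,
    is_git_url,
    process_repository,
)

repo = 'https://github.com/awslabs/mcp.git'  # placeholder URL
local_path = clone_repository(repo) if is_git_url(repo) else repo

try:
    # For a GitHub URL the name comes back as 'org_repo', e.g. 'awslabs_mcp' here
    name = get_repository_name(repo)

    # Explicit patterns keep the sketch independent of the Constants defaults
    chunks, chunk_to_file, stats = process_repository(
        local_path,
        include_patterns=['*.py', '*.md'],
        exclude_patterns=['.git/*'],
        chunk_size=1000,
        chunk_overlap=200,
    )
    print(f'{name}: {len(chunks)} chunks from {sum(stats.values())} text files')
finally:
    # Only remove the checkout if this sketch cloned it into a temp directory
    if local_path != repo:
        cleanup_repository(local_path)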