awslabs.git-repo-research-mcp-server 0.0.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,471 @@
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+ # with the License. A copy of the License is located at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
+ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
+ # and limitations under the License.
+ """GitHub repository search functionality for Git Repository Research MCP Server.
+
+ This module provides functionality for searching GitHub repositories using the GitHub GraphQL API.
+ """
+
+ import backoff
+ import os
+ import requests
+ import time
+ from loguru import logger
+ from typing import Any, Dict, List, Optional
+
+
+ # GitHub GraphQL API query for repository search
+ GITHUB_GRAPHQL_QUERY = """
+ query SearchRepositories($query: String!, $numResults: Int!) {
+   search(query: $query, type: REPOSITORY, first: $numResults) {
+     repositoryCount
+     edges {
+       node {
+         ... on Repository {
+           nameWithOwner
+           name
+           owner {
+             login
+           }
+           url
+           description
+           stargazerCount
+           updatedAt
+           primaryLanguage {
+             name
+           }
+           repositoryTopics(first: 10) {
+             nodes {
+               topic {
+                 name
+               }
+             }
+           }
+           licenseInfo {
+             name
+           }
+           forkCount
+           openIssues: issues(states: OPEN) {
+             totalCount
+           }
+           homepageUrl
+         }
+       }
+     }
+   }
+ }
+ """
+
+
+ @backoff.on_exception(
+     backoff.expo,
+     (requests.exceptions.RequestException, requests.exceptions.HTTPError),
+     max_tries=5,
+     giveup=lambda e: bool(
+         (response := getattr(e, 'response', None))
+         and getattr(response, 'status_code', None) == 401
+     ),  # Don't retry on auth failures
+ )
+ def github_graphql_request(
+     query: str, variables: Dict[str, Any], token: Optional[str] = None
+ ) -> Dict[str, Any]:
+     """Make a request to the GitHub GraphQL API with exponential backoff for rate limiting.
+
+     Args:
+         query: The GraphQL query
+         variables: Variables for the GraphQL query
+         token: Optional GitHub token for authentication
+
+     Returns:
+         The JSON response from the API
+     """
+     headers = {
+         'Content-Type': 'application/json',
+     }
+
+     # Add authorization header if token is provided
+     if token:
+         headers['Authorization'] = f'Bearer {token}'
+
+     try:
+         response = requests.post(
+             'https://api.github.com/graphql',
+             headers=headers,
+             json={'query': query, 'variables': variables},
+             timeout=10,  # Add 10 second timeout to prevent hanging requests
+         )
+
+         # Check for rate limiting
+         if response.status_code == 403 and 'rate limit' in response.text.lower():
+             # For unauthenticated requests, don't wait - just log and return empty response
+             if not token:
+                 logger.warning(
+                     'Rate limited by GitHub API and no token provided. Consider adding a GITHUB_TOKEN.'
+                 )
+                 return {'data': {'search': {'edges': []}}}
+
+             # For authenticated requests, check reset time but cap at reasonable value
+             reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
+             current_time = int(time.time())
+             wait_time = min(max(reset_time - current_time, 0), 60)  # Cap at 60 seconds
+
+             if wait_time > 0:
+                 logger.warning(f'Rate limited by GitHub API. Waiting {wait_time} seconds.')
+                 time.sleep(wait_time)
+                 # Retry the request
+                 return github_graphql_request(query, variables, token)
+
+         # Raise exception for other HTTP errors
+         response.raise_for_status()
+
+         return response.json()
+
+     except requests.exceptions.RequestException as e:
+         logger.error(f'GitHub API request error: {str(e)}')
+         raise
+
+
+ def github_repo_search_graphql(
+     keywords: List[str],
+     organizations: List[str],
+     num_results: int = 5,
+     token: Optional[str] = None,
+     license_filter: Optional[List[str]] = None,
+ ) -> List[Dict[str, Any]]:
+     """Search GitHub repositories using the GraphQL API.
+
+     Args:
+         keywords: List of keywords to search for
+         organizations: List of GitHub organizations to scope the search to
+         num_results: Number of results to return
+         token: Optional GitHub token for authentication
+         license_filter: Optional list of license names to filter repositories by
+
+     Returns:
+         List of GitHub repositories matching the search criteria
+     """
+     # Build the search query with organization filters
+     org_filters = ' '.join([f'org:{org}' for org in organizations])
+     keyword_string = ' OR '.join(keywords)
+     query_string = f'{keyword_string} {org_filters}'
+
+     logger.info(f'Searching GitHub with GraphQL query: {query_string}')
+
+     try:
+         # Make the GraphQL request
+         variables = {
+             'query': query_string,
+             'numResults': num_results * 2,  # Request more than needed to filter
+         }
+
+         response = github_graphql_request(GITHUB_GRAPHQL_QUERY, variables, token)
+
+         if 'errors' in response:
+             error_messages = [
+                 error.get('message', 'Unknown error') for error in response['errors']
+             ]
+             logger.error(f'GitHub GraphQL API errors: {", ".join(error_messages)}')
+             return []
+
+         # Extract repository data from response
+         search_data = response.get('data', {}).get('search', {})
+         edges = search_data.get('edges', [])
+
+         repo_results = []
+         processed_urls = set()  # To avoid duplicates
+
+         for edge in edges:
+             node = edge.get('node', {})
+
+             # Extract repository information
+             repo_url = node.get('url', '')
+             name_with_owner = node.get('nameWithOwner', '')
+             description = node.get('description', '')
+             owner = node.get('owner', {}).get('login', '')
+
+             # Skip if we've already processed this URL or if it's not from one of our target organizations
+             if repo_url in processed_urls or owner.lower() not in [
+                 org.lower() for org in organizations
+             ]:
+                 continue
+
+             processed_urls.add(repo_url)
+
+             # Extract primary language if available
+             primary_language = node.get('primaryLanguage', {})
+             language = primary_language.get('name') if primary_language else None
+
+             # Extract topics if available
+             topics_data = node.get('repositoryTopics', {}).get('nodes', [])
+             topics = [
+                 topic.get('topic', {}).get('name') for topic in topics_data if topic.get('topic')
+             ]
+
+             # Extract license information if available
+             license_info = node.get('licenseInfo', {})
+             license_name = license_info.get('name') if license_info else None
+
+             # Skip if license filter is specified and this repository's license doesn't match
+             if license_filter and license_name and license_name not in license_filter:
+                 continue
+
+             # Extract open issues count
+             open_issues = node.get('openIssues', {}).get('totalCount', 0)
+
+             # Add to results with additional metadata
+             repo_results.append(
+                 {
+                     'url': repo_url,
+                     'title': name_with_owner,
+                     'description': description,
+                     'organization': owner,
+                     'stars': node.get('stargazerCount', 0),
+                     'updated_at': node.get('updatedAt', ''),
+                     'language': language,
+                     'topics': topics,
+                     'license': license_name,
+                     'forks': node.get('forkCount', 0),
+                     'open_issues': open_issues,
+                     'homepage': node.get('homepageUrl'),
+                 }
+             )
+
+             # Stop if we have enough results
+             if len(repo_results) >= num_results:
+                 break
+
+         logger.info(f'Found {len(repo_results)} GitHub repositories via GraphQL API')
+         return repo_results
+
+     except Exception as e:
+         logger.error(f'GitHub GraphQL search error: {str(e)}')
+         return []
+
+
+ def clean_github_url(url: str) -> str:
+     """Clean up GitHub URLs to get the main repository URL.
+
+     For example, convert:
+     https://github.com/aws-samples/aws-cdk-examples/blob/main/typescript/api-gateway-lambda/index.ts
+     to:
+     https://github.com/aws-samples/aws-cdk-examples
+
+     Args:
+         url: The GitHub URL to clean
+
+     Returns:
+         The cleaned GitHub repository URL
+     """
+     # Basic implementation - can be enhanced for edge cases
+     if 'github.com' not in url:
+         return url
+
+     parts = url.split('github.com/')
+     if len(parts) < 2:
+         return url
+
+     repo_path = parts[1]
+     # Extract org/repo part (first two segments)
+     repo_segments = repo_path.split('/')
+     if len(repo_segments) >= 2:
+         return f'https://github.com/{repo_segments[0]}/{repo_segments[1]}'
+
+     return url
+
+
+ def extract_org_from_url(url: str) -> Optional[str]:
+     """Extract organization name from GitHub URL.
+
+     Args:
+         url: The GitHub URL to extract the organization from
+
+     Returns:
+         The organization name, or None if not found
+     """
+     if 'github.com' not in url:
+         return None
+
+     parts = url.split('github.com/')
+     if len(parts) < 2:
+         return None
+
+     repo_path = parts[1]
+     org = repo_path.split('/')[0]
+     return org
+
+
+ def github_repo_search_rest(
+     keywords: List[str],
+     organizations: List[str],
+     num_results: int = 5,
+     license_filter: Optional[List[str]] = None,
+ ) -> List[Dict[str, Any]]:
+     """Search GitHub repositories using the REST API.
+
+     This is a fallback for when GraphQL API is rate limited and no token is provided.
+
+     Args:
+         keywords: List of keywords to search for
+         organizations: List of GitHub organizations to scope the search to
+         num_results: Number of results to return
+         license_filter: Optional list of license names to filter repositories by
+
+     Returns:
+         List of GitHub repositories matching the search criteria
+     """
+     repo_results = []
+     processed_urls = set()
+
+     # Process each organization separately
+     for org in organizations:
+         try:
+             # Build the search query for this organization
+             keyword_string = '+OR+'.join(keywords)
+             query_string = f'{keyword_string}+org:{org}'
+
+             logger.info(f'Searching GitHub REST API for org {org}')
+
+             # Make the REST API request
+             response = requests.get(
+                 f'https://api.github.com/search/repositories?q={query_string}&sort=stars&order=desc&per_page={num_results}',
+                 headers={'Accept': 'application/vnd.github.v3+json'},
+                 timeout=10,  # Add 10 second timeout to prevent hanging requests
+             )
+
+             # Check for errors
+             response.raise_for_status()
+
+             # Parse the response
+             data = response.json()
+             items = data.get('items', [])
+
+             # Process each repository
+             for item in items:
+                 repo_url = item.get('html_url', '')
+
+                 # Skip if we've already processed this URL
+                 if repo_url in processed_urls:
+                     continue
+
+                 processed_urls.add(repo_url)
+
+                 # Extract license information if available
+                 license_info = item.get('license')
+                 license_name = license_info.get('name') if license_info else None
+
+                 # Skip if license filter is specified and this repository's license doesn't match
+                 if license_filter and license_name and license_name not in license_filter:
+                     continue
+
+                 # Extract topics if available
+                 topics = item.get('topics', [])
+
+                 # Add to results with additional metadata
+                 repo_results.append(
+                     {
+                         'url': repo_url,
+                         'title': item.get('full_name', ''),
+                         'description': item.get('description', ''),
+                         'organization': org,
+                         'stars': item.get('stargazers_count', 0),
+                         'updated_at': item.get('updated_at', ''),
+                         'language': item.get('language'),
+                         'topics': topics,
+                         'license': license_name,
+                         'forks': item.get('forks_count', 0),
+                         'open_issues': item.get('open_issues_count', 0),
+                         'homepage': item.get('homepage'),
+                     }
+                 )
+
+                 # Stop if we have enough results
+                 if len(repo_results) >= num_results:
+                     break
+
+             # Add a small delay between requests to avoid rate limiting
+             time.sleep(1)
+
+         except Exception as e:
+             logger.error(f'GitHub REST API error for org {org}: {str(e)}')
+             continue
+
+     logger.info(f'Found {len(repo_results)} GitHub repositories via REST API')
+     return repo_results
+
+
+ def github_repo_search_wrapper(**kwargs) -> List[Dict[str, Any]]:
+     """Wrapper for GitHub API search that returns GitHub repository results.
+
+     Args:
+         **kwargs: Keyword arguments including:
+             - keywords: List of keywords to search for
+             - organizations: List of GitHub organizations to scope the search to
+             - num_results: Number of results to return
+
+     Returns:
+         List of GitHub repositories matching the search criteria
+     """
+     # Extract keywords from kwargs
+     if 'args' in kwargs:
+         keywords = kwargs['args']
+     elif 'keywords' in kwargs:
+         keywords = kwargs['keywords']
+     else:
+         # Convert all values to strings and split by spaces
+         keywords_str = ' '.join(str(value) for value in kwargs.values())
+         keywords = keywords_str.split()
+
+     # Ensure keywords is a list
+     if isinstance(keywords, str):
+         keywords = keywords.split()
+
+     # Get organizations to search in
+     organizations = kwargs.get(
+         'organizations', ['aws-samples', 'aws-solutions-library-samples', 'awslabs']
+     )
+     num_results = kwargs.get('num_results', 5)
+     license_filter = kwargs.get('license_filter')
+
+     # Get GitHub token from environment variable
+     token = os.environ.get('GITHUB_TOKEN')
+
+     try:
+         # GraphQL API requires authentication, so only use it if token is provided
+         if token:
+             logger.info('Using authenticated GitHub GraphQL API')
+             results = github_repo_search_graphql(
+                 keywords=keywords,
+                 organizations=organizations,
+                 num_results=num_results,
+                 token=token,
+                 license_filter=license_filter,
+             )
+         # Always use REST API for unauthenticated requests
+         else:
+             logger.info('Using unauthenticated GitHub REST API (GraphQL requires auth)')
+             results = github_repo_search_rest(
+                 keywords=keywords,
+                 organizations=organizations,
+                 num_results=num_results,
+                 license_filter=license_filter,
+             )
+
+         # Sort results by stars (descending) and then by updated_at date
+         results.sort(
+             key=lambda x: (
+                 -(x.get('stars', 0) or 0),  # Sort by stars descending
+                 x.get('updated_at', ''),  # Then by updated_at
+             )
+         )
+
+         return results
+     except Exception as e:
+         logger.error(f'GitHub repository search error: {str(e)}')
+         return []
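
For orientation, below is a minimal sketch of how the module's entry point might be called from other code. The keyword arguments, the GITHUB_TOKEN environment variable, and the result keys all come from the module above; the import path is only inferred from the distribution name, and the example keyword, organization, and license values are illustrative, not part of the package.

    import os

    # Import path assumed from the distribution name; adjust if the module lives elsewhere.
    from awslabs.git_repo_research_mcp_server.github_search import github_repo_search_wrapper

    # With a token the wrapper takes the authenticated GraphQL path; without one it
    # falls back to the unauthenticated REST search.
    os.environ.setdefault('GITHUB_TOKEN', '<your-token>')

    results = github_repo_search_wrapper(
        keywords=['bedrock', 'agent'],
        organizations=['awslabs', 'aws-samples'],
        num_results=5,
        license_filter=['Apache License 2.0', 'MIT License'],
    )

    # Results are sorted by stars (descending), then by last update.
    for repo in results:
        print(repo['stars'], repo['title'], repo['url'])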