awslabs.git-repo-research-mcp-server 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- awslabs/__init__.py +12 -0
- awslabs/git_repo_research_mcp_server/__init__.py +13 -0
- awslabs/git_repo_research_mcp_server/defaults.py +347 -0
- awslabs/git_repo_research_mcp_server/embeddings.py +66 -0
- awslabs/git_repo_research_mcp_server/github_search.py +471 -0
- awslabs/git_repo_research_mcp_server/indexer.py +860 -0
- awslabs/git_repo_research_mcp_server/models.py +291 -0
- awslabs/git_repo_research_mcp_server/repository.py +321 -0
- awslabs/git_repo_research_mcp_server/search.py +350 -0
- awslabs/git_repo_research_mcp_server/server.py +914 -0
- awslabs/git_repo_research_mcp_server/utils.py +396 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/METADATA +190 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/RECORD +17 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/WHEEL +4 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/entry_points.txt +2 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/licenses/LICENSE +175 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/licenses/NOTICE +2 -0
awslabs/git_repo_research_mcp_server/models.py

@@ -0,0 +1,291 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
# with the License. A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
# and limitations under the License.
"""Data models for Git Repository Research MCP Server."""

from datetime import datetime
from enum import Enum
from pydantic import BaseModel, Field
from typing import Dict, List, Optional


class GitHubConfig(BaseModel):
    """GitHub API configuration.

    This model defines the configuration for the GitHub API, including
    the optional token for authentication and the API URL.
    """

    token: Optional[str] = Field(None, description='GitHub API token for increased rate limits')
    api_url: str = Field(
        default='https://api.github.com/graphql', description='GitHub GraphQL API URL'
    )


class IndexMetadata(BaseModel):
    """Metadata for a repository index.

    This model stores information about an indexed repository, including
    its location, creation time, and statistics about the indexed content.
    """

    repository_name: str = Field(..., description='Name of the repository')
    repository_path: str = Field(..., description='Path or URL of the repository')
    index_path: str = Field(..., description='Path to the index file')
    created_at: datetime = Field(
        default_factory=datetime.now, description='When the index was created'
    )
    last_accessed: Optional[datetime] = Field(None, description='When the index was last accessed')
    file_count: int = Field(0, description='Number of files indexed')
    chunk_count: int = Field(0, description='Number of text chunks indexed')
    embedding_model: str = Field(..., description='Model used for embeddings')
    file_types: Dict[str, int] = Field(
        default_factory=dict, description='Count of file types indexed'
    )
    total_tokens: Optional[int] = Field(None, description='Total number of tokens processed')
    index_size_bytes: Optional[int] = Field(None, description='Size of the index in bytes')
    last_commit_id: Optional[str] = Field(
        None, description='ID of the last commit in the repository'
    )
    repository_directory: Optional[str] = Field(
        None, description='Path to the cloned repository directory'
    )


class SearchResult(BaseModel):
    """Result from a repository search.

    This model represents a single search result, including the file path,
    relevant content, and similarity score.
    """

    file_path: str = Field(..., description='Path to the file within the repository')
    content: str = Field(..., description='Relevant content snippet')
    score: float = Field(..., description='Similarity score (0-1)')
    line_numbers: Optional[List[int]] = Field(None, description='Line numbers for the content')
    metadata: Optional[Dict[str, str]] = Field(
        None, description='Additional metadata about the result'
    )


class SearchResponse(BaseModel):
    """Response from a repository search.

    This model represents the complete response from a search operation,
    including all matching results and query metadata.
    """

    results: List[SearchResult] = Field(default_factory=list, description='Search results')
    query: str = Field(..., description='Original search query')
    index_path: str = Field(..., description='Path to the index that was searched')
    repository_name: str = Field(..., description='Name of the repository')
    repository_directory: Optional[str] = Field(
        None, description='Path to the cloned repository directory'
    )
    timestamp: datetime = Field(
        default_factory=datetime.now, description='When the search was performed'
    )
    total_results: int = Field(0, description='Total number of results found')
    execution_time_ms: Optional[float] = Field(
        None, description='Search execution time in milliseconds'
    )


class IndexedRepositoryInfo(BaseModel):
    """Information about an indexed repository.

    This model provides a summary of an indexed repository for listing purposes.
    """

    repository_name: str = Field(..., description='Name of the repository')
    repository_path: str = Field(..., description='Path or URL of the repository')
    index_path: str = Field(..., description='Path to the index file')
    repository_directory: Optional[str] = Field(
        None, description='Path to the cloned repository directory'
    )
    created_at: datetime = Field(..., description='When the index was created')
    last_accessed: Optional[datetime] = Field(None, description='When the index was last accessed')
    file_count: int = Field(0, description='Number of files indexed')
    embedding_model: str = Field(..., description='Model used for embeddings')


class IndexedRepositoriesResponse(BaseModel):
    """Response containing a list of indexed repositories.

    This model represents the complete response from a list operation,
    including all indexed repositories and summary statistics.
    """

    repositories: List[IndexedRepositoryInfo] = Field(
        default_factory=list, description='List of indexed repositories'
    )
    total_count: int = Field(0, description='Total number of indexed repositories')
    index_directory: str = Field(..., description='Directory containing the indices')


class DetailedIndexedRepositoryInfo(IndexedRepositoryInfo):
    """Detailed information about an indexed repository.

    This model extends the basic repository info with additional details
    about the indexed content.
    """

    chunk_count: int = Field(0, description='Number of text chunks indexed')
    file_types: Dict[str, int] = Field(
        default_factory=dict, description='Count of file types indexed'
    )
    total_tokens: Optional[int] = Field(None, description='Total number of tokens processed')
    index_size_bytes: Optional[int] = Field(None, description='Size of the index in bytes')
    last_commit_id: Optional[str] = Field(
        None, description='ID of the last commit in the repository'
    )


class DetailedIndexedRepositoriesResponse(BaseModel):
    """Response containing detailed information about indexed repositories.

    This model represents the complete response from a detailed list operation,
    including all indexed repositories with detailed information.
    """

    repositories: List[DetailedIndexedRepositoryInfo] = Field(
        default_factory=list,
        description='List of indexed repositories with detailed information',
    )
    total_count: int = Field(0, description='Total number of indexed repositories')
    index_directory: str = Field(..., description='Directory containing the indices')
    total_index_size_bytes: Optional[int] = Field(
        None, description='Total size of all indices in bytes'
    )


class EmbeddingModel(str, Enum):
    """Available embedding models.

    This enum defines the available embedding models that can be used
    for generating embeddings from repository content.
    """

    AMAZON_TITAN_EMBED_TEXT_V1 = 'amazon.titan-embed-text-v1'
    AMAZON_TITAN_EMBED_TEXT_V2 = 'amazon.titan-embed-text-v2:0'
    COHERE_EMBED_ENGLISH_V3 = 'cohere.embed-english-v3'
    COHERE_EMBED_MULTILINGUAL_V3 = 'cohere.embed-multilingual-v3'


class IndexRepositoryResponse(BaseModel):
    """Response from indexing a repository.

    This model represents the complete response from an indexing operation,
    including metadata about the created index.
    """

    status: str = Field(..., description='Status of the indexing operation')
    repository_name: str = Field(..., description='Name of the repository')
    repository_path: str = Field(..., description='Path or URL of the repository')
    index_path: str = Field(..., description='Path to the created index')
    repository_directory: Optional[str] = Field(
        None, description='Path to the cloned repository directory'
    )
    file_count: int = Field(0, description='Number of files indexed')
    chunk_count: int = Field(0, description='Number of text chunks indexed')
    embedding_model: str = Field(..., description='Model used for embeddings')
    execution_time_ms: Optional[float] = Field(
        None, description='Indexing execution time in milliseconds'
    )
    message: Optional[str] = Field(
        None, description='Additional information about the indexing operation'
    )


class GitHubRepoSearchInput(BaseModel):
    """Input for GitHub repository search.

    This model defines the input parameters for searching GitHub repositories
    based on keywords and organizations.
    """

    keywords: List[str] = Field(description='List of keywords to search for GitHub repositories')
    organizations: Optional[List[str]] = Field(
        default=['aws-samples', 'aws-solutions-library-samples', 'awslabs'],
        description='List of GitHub organizations to scope the search to',
    )
    num_results: Optional[int] = Field(default=5, description='Number of results to return')
    license_filter: Optional[List[str]] = Field(
        default=None,
        description="List of licenses to filter by (e.g., 'Apache License 2.0', 'MIT No Attribution')",
    )


class GitHubRepoSearchResult(BaseModel):
    """Result from a GitHub repository search.

    This model represents a single GitHub repository search result.
    """

    url: str = Field(..., description='URL of the GitHub repository')
    title: str = Field(..., description='Title of the repository')
    description: Optional[str] = Field(None, description='Description of the repository')
    organization: str = Field(..., description='GitHub organization that owns the repository')
    stars: Optional[int] = Field(None, description='Number of stars the repository has')
    updated_at: Optional[str] = Field(None, description='When the repository was last updated')
    language: Optional[str] = Field(
        None, description='Primary programming language of the repository'
    )
    topics: Optional[List[str]] = Field(
        None, description='Topics/tags associated with the repository'
    )
    license: Optional[str] = Field(None, description='License of the repository')
    forks: Optional[int] = Field(None, description='Number of forks the repository has')
    open_issues: Optional[int] = Field(None, description='Number of open issues in the repository')
    homepage: Optional[str] = Field(None, description='Homepage URL of the repository')


class GitHubRepoSearchResponse(BaseModel):
    """Response from a GitHub repository search.

    This model represents the complete response from a GitHub repository search.
    """

    status: str = Field(..., description='Status of the search operation')
    query: str = Field(..., description='Original search query')
    organizations: List[str] = Field(..., description='Organizations that were searched')
    results: List[GitHubRepoSearchResult] = Field(
        default_factory=list, description='Search results'
    )
    total_results: int = Field(0, description='Total number of results found')
    execution_time_ms: Optional[float] = Field(
        None, description='Search execution time in milliseconds'
    )


class DeleteRepositoryResponse(BaseModel):
    """Response from deleting a repository.

    This model represents the complete response from a delete operation,
    including status and information about the deleted repository.
    """

    status: str = Field(
        ..., description='Status of the delete operation (success, partial, or error)'
    )
    message: str = Field(..., description='Information about the delete operation')
    repository_name: Optional[str] = Field(None, description='Name of the deleted repository')
    execution_time_ms: Optional[float] = Field(
        None, description='Delete operation execution time in milliseconds'
    )
    deleted_files: Optional[List[str]] = Field(
        None, description='List of files that were successfully deleted'
    )
    errors: Optional[List[str]] = Field(
        None, description='List of errors encountered during deletion'
    )
    permission_issues: Optional[List[str]] = Field(
        None, description='List of files with permission issues'
    )
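
The models above are plain Pydantic schemas, so the server's tool responses nest in the obvious way. A minimal sketch of assembling a search response, assuming the wheel is installed; the query text, file path, index path, and score below are invented example values, not output of the package:

from awslabs.git_repo_research_mcp_server.models import SearchResponse, SearchResult

# Build one result and wrap it in a response; each keyword argument mirrors a
# Field(...) definition from models.py above.
result = SearchResult(
    file_path='README.md',                       # hypothetical file inside the indexed repo
    content='## Installation\npip install ...',  # snippet returned for the match
    score=0.87,                                  # similarity score in the 0-1 range
)
response = SearchResponse(
    query='how do I install this package?',      # invented query text
    index_path='/tmp/indices/example_repo',      # hypothetical index location
    repository_name='example_repo',
    results=[result],
    total_results=1,
)
print(response.repository_name, response.results[0].score)
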
awslabs/git_repo_research_mcp_server/repository.py

@@ -0,0 +1,321 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
# with the License. A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
# and limitations under the License.
"""Repository handling for Git Repository Research MCP Server.

This module provides functionality for cloning, accessing, and processing
Git repositories for indexing and searching.
"""

import fnmatch
import os
import shutil
import tempfile
from awslabs.git_repo_research_mcp_server.defaults import Constants
from git import Repo
from loguru import logger
from typing import Dict, List, Optional, Tuple
from urllib.parse import urlparse


def is_git_url(repo_path: str) -> bool:
    """Check if a string is a Git URL.

    Args:
        repo_path: Path or URL to check

    Returns:
        True if the string is a Git URL, False otherwise
    """
    parsed = urlparse(repo_path)
    return parsed.scheme in ('http', 'https', 'git', 'ssh')


def is_git_repo(path: str) -> bool:
    """Check if a path is a Git repository.

    Args:
        path: Path to check

    Returns:
        True if the path is a Git repository, False otherwise
    """
    try:
        Repo(path)
        return True
    except Exception:
        return False


def clone_repository(url: str, target_dir: Optional[str] = None) -> str:
    """Clone a Git repository from a URL.

    Args:
        url: URL of the repository to clone
        target_dir: Directory to clone into (optional, uses temp dir if not provided)

    Returns:
        Path to the cloned repository

    Raises:
        Exception: If cloning fails
    """
    if target_dir is None:
        target_dir = tempfile.mkdtemp(prefix='git_repo_research_')

    logger.info(f'Cloning repository from {url} to {target_dir}')
    try:
        # Clone the repository with GitPython
        Repo.clone_from(url, target_dir)

        # Check if .git directory exists after cloning
        git_dir = os.path.join(target_dir, '.git')
        if os.path.exists(git_dir):
            logger.info(f'.git directory exists at {git_dir}')
        else:
            logger.warning(f'.git directory not found after cloning at {git_dir}')
            # List the contents of the directory to debug
            logger.info(f'Contents of {target_dir}: {os.listdir(target_dir)}')

        return target_dir
    except Exception as e:
        # Clean up the target directory if it was created
        if os.path.exists(target_dir):
            shutil.rmtree(target_dir, ignore_errors=True)
        logger.error(f'Failed to clone repository: {e}')
        raise


def get_repository_name(repo_path: str) -> str:
    """Get the name of a repository.

    Args:
        repo_path: Path or URL of the repository

    Returns:
        Name of the repository, including GitHub organization/username if available
        Note: For GitHub repositories, the format is "org_repo" (with underscore)
        instead of "org/repo" for file path compatibility
    """
    if is_git_url(repo_path):
        # Extract the repository name from the URL
        parsed = urlparse(repo_path)
        path_parts = parsed.path.strip('/').split('/')

        # Check if this is a GitHub URL with org/username
        if parsed.netloc in ['github.com', 'www.github.com'] and len(path_parts) >= 2:
            # Include the organization/username in the repository name
            org_name = path_parts[-2]
            repo_name = path_parts[-1]
            if repo_name.endswith('.git'):
                repo_name = repo_name[:-4]
            # Use underscore instead of slash for file path compatibility
            return f'{org_name}_{repo_name}'
        else:
            # For non-GitHub URLs or URLs without clear org structure,
            # just use the last part of the path
            repo_name = path_parts[-1]
            if repo_name.endswith('.git'):
                repo_name = repo_name[:-4]
            return repo_name
    else:
        # Use the directory name as the repository name
        return os.path.basename(os.path.abspath(repo_path))


def get_text_files(
    repo_path: str,
    include_patterns: Optional[List[str]] = None,
    exclude_patterns: Optional[List[str]] = None,
) -> List[str]:
    """Get all text files in a repository.

    Args:
        repo_path: Path to the repository
        include_patterns: Glob patterns for files to include (optional)
        exclude_patterns: Glob patterns for files to exclude (optional)

    Returns:
        List of paths to text files
    """
    if include_patterns is None:
        include_patterns = Constants.TEXT_FILE_INCLUDE_PATTERNS
    if exclude_patterns is None:
        exclude_patterns = Constants.TEXT_FILE_EXCLUDE_PATTERNS

    text_files = []
    for root, _, files in os.walk(repo_path):
        for file in files:
            file_path = os.path.join(root, file)
            rel_path = os.path.relpath(file_path, repo_path)

            # Check if the file matches any include pattern
            included = any(fnmatch.fnmatch(rel_path, pattern) for pattern in include_patterns)
            if not included:
                continue

            # Check if the file matches any exclude pattern
            excluded = any(fnmatch.fnmatch(rel_path, pattern) for pattern in exclude_patterns)
            if excluded:
                continue

            # Try to read the file as text
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    # Read a small sample to check if it's text
                    sample = f.read(1024)
                    # If we can decode it as UTF-8, it's probably text
                    if sample:
                        text_files.append(file_path)
            except UnicodeDecodeError:
                # Not a text file
                pass
            except Exception as e:
                logger.warning(f'Error reading file {file_path}: {e}')

    return text_files


def get_file_extension_stats(file_paths: List[str]) -> Dict[str, int]:
    """Get statistics about file extensions.

    Args:
        file_paths: List of file paths

    Returns:
        Dictionary mapping file extensions to counts
    """
    extension_counts = {}
    for file_path in file_paths:
        _, ext = os.path.splitext(file_path)
        if ext:
            # Remove the dot from the extension
            ext = ext[1:].lower()
            extension_counts[ext] = extension_counts.get(ext, 0) + 1
        else:
            extension_counts['no_extension'] = extension_counts.get('no_extension', 0) + 1
    return extension_counts


def read_file_content(file_path: str) -> str:
    """Read the content of a file.

    Args:
        file_path: Path to the file

    Returns:
        Content of the file as a string

    Raises:
        Exception: If reading fails
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        logger.error(f'Failed to read file {file_path}: {e}')
        raise


def chunk_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[str]:
    """Split text into chunks.

    Args:
        text: Text to split
        chunk_size: Maximum size of each chunk in characters
        chunk_overlap: Overlap between chunks in characters

    Returns:
        List of text chunks
    """
    if not text or len(text) <= chunk_size:
        return [text] if text else []

    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        if end >= len(text):
            chunks.append(text[start:])
            break

        # Try to find a good breaking point (newline or space)
        break_point = text.rfind('\n', start + chunk_size - chunk_overlap, end)
        if break_point == -1:
            break_point = text.rfind(' ', start + chunk_size - chunk_overlap, end)
        if break_point == -1:
            break_point = end

        chunks.append(text[start:break_point])
        start = break_point + 1 if text[break_point] in ['\n', ' '] else break_point

    return chunks


def process_repository(
    repo_path: str,
    include_patterns: Optional[List[str]] = None,
    exclude_patterns: Optional[List[str]] = None,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
) -> Tuple[List[str], Dict[str, str], Dict[str, int]]:
    """Process a repository for indexing.

    Args:
        repo_path: Path to the repository
        include_patterns: Glob patterns for files to include (optional)
        exclude_patterns: Glob patterns for files to exclude (optional)
        chunk_size: Maximum size of each chunk in characters
        chunk_overlap: Overlap between chunks in characters

    Returns:
        Tuple containing:
        - List of text chunks
        - Dictionary mapping chunks to file paths
        - Dictionary of file extension statistics
    """
    logger.info(f'Processing repository at {repo_path}')
    text_files = get_text_files(repo_path, include_patterns, exclude_patterns)
    logger.info(f'Found {len(text_files)} text files')

    extension_stats = get_file_extension_stats(text_files)
    logger.info(f'File extension statistics: {extension_stats}')

    chunks = []
    chunk_to_file = {}

    for file_path in text_files:
        try:
            content = read_file_content(file_path)
            file_chunks = chunk_text(content, chunk_size, chunk_overlap)

            rel_path = os.path.relpath(file_path, repo_path)
            for chunk in file_chunks:
                chunks.append(chunk)
                chunk_to_file[chunk] = rel_path
        except Exception as e:
            logger.warning(f'Error processing file {file_path}: {e}')

    logger.info(f'Created {len(chunks)} text chunks')
    return chunks, chunk_to_file, extension_stats


def cleanup_repository(repo_path: str) -> None:
    """Clean up a cloned repository.

    Args:
        repo_path: Path to the repository
    """
    if os.path.exists(repo_path) and os.path.isdir(repo_path):
        logger.info(f'Cleaning up repository at {repo_path}')
        try:
            shutil.rmtree(repo_path, ignore_errors=True)
        except Exception as e:
            logger.warning(f'Error cleaning up repository: {e}')
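
Taken together, repository.py provides the clone → chunk → cleanup path that the indexer builds on. A minimal sketch of that flow using the functions above, assuming the wheel is installed and the URL can be cloned; the repository URL is only an illustration:

from awslabs.git_repo_research_mcp_server.repository import (
    cleanup_repository,
    clone_repository,
    get_repository_name,
    process_repository,
)

repo_url = 'https://github.com/awslabs/mcp.git'  # hypothetical repository URL
repo_dir = clone_repository(repo_url)            # clones into a git_repo_research_* temp dir
try:
    # For GitHub URLs the name comes back as 'org_repo' (underscore, not slash).
    name = get_repository_name(repo_url)
    # Returns (chunks, chunk -> relative path map, file-extension statistics).
    chunks, chunk_to_file, stats = process_repository(
        repo_dir, chunk_size=1000, chunk_overlap=200
    )
    print(name, len(chunks), stats)
finally:
    cleanup_repository(repo_dir)                  # removes the temporary clone
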