awslabs.git-repo-research-mcp-server 0.0.1 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- awslabs/__init__.py +12 -0
- awslabs/git_repo_research_mcp_server/__init__.py +13 -0
- awslabs/git_repo_research_mcp_server/defaults.py +347 -0
- awslabs/git_repo_research_mcp_server/embeddings.py +66 -0
- awslabs/git_repo_research_mcp_server/github_search.py +471 -0
- awslabs/git_repo_research_mcp_server/indexer.py +860 -0
- awslabs/git_repo_research_mcp_server/models.py +291 -0
- awslabs/git_repo_research_mcp_server/repository.py +321 -0
- awslabs/git_repo_research_mcp_server/search.py +350 -0
- awslabs/git_repo_research_mcp_server/server.py +914 -0
- awslabs/git_repo_research_mcp_server/utils.py +396 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/METADATA +190 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/RECORD +17 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/WHEEL +4 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/entry_points.txt +2 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/licenses/LICENSE +175 -0
- awslabs_git_repo_research_mcp_server-0.0.1.dist-info/licenses/NOTICE +2 -0
awslabs/git_repo_research_mcp_server/indexer.py
@@ -0,0 +1,860 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
# with the License. A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
# and limitations under the License.
"""FAISS indexing for Git Repository Research MCP Server using LangChain.

This module provides functionality for creating and managing FAISS indices
for Git repositories using LangChain's FAISS implementation.
"""

import faiss
import json
import os
import shutil
import time
from awslabs.git_repo_research_mcp_server.defaults import Constants
from awslabs.git_repo_research_mcp_server.embeddings import get_embedding_model
from awslabs.git_repo_research_mcp_server.models import (
    EmbeddingModel,
    IndexMetadata,
    IndexRepositoryResponse,
)
from awslabs.git_repo_research_mcp_server.repository import (
    cleanup_repository,
    clone_repository,
    get_repository_name,
    is_git_repo,
    is_git_url,
    process_repository,
)
from datetime import datetime
from git import Repo
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from loguru import logger
from pydantic import BaseModel, field_validator
from pydantic_core.core_schema import ValidationInfo
from typing import Any, Dict, List, Optional, Tuple


class RepositoryConfig(BaseModel):
    """Configuration for repository indexing.

    This class defines the configuration parameters for indexing a Git repository,
    including paths, patterns for file inclusion/exclusion, and chunking parameters.
    """

    repository_path: str
    output_path: Optional[str] = None
    include_patterns: Optional[List[str]] = None
    exclude_patterns: Optional[List[str]] = None
    chunk_size: int = 1000
    chunk_overlap: int = 200

    @field_validator('repository_path')
    @classmethod
    def validate_repository_path(cls, git_string_url):
        """Validate the repository path.

        :param git_string_url: Git URL or local path
        :return: Validated repository path.
        """
        if not (is_git_url(git_string_url) or os.path.exists(git_string_url)):
            raise ValueError('Repository path must be a valid Git URL or existing local path')
        return git_string_url

    @field_validator('chunk_size')
    @classmethod
    def validate_chunk_size(cls, chunk_size):
        """Validate the chunk size.

        :param chunk_size: Chunk size value
        :return: Validated chunk size.
        """
        if chunk_size <= 0:
            raise ValueError('Chunk size must be positive')
        return chunk_size

    @field_validator('chunk_overlap')
    @classmethod
    def validate_chunk_overlap(cls, v: int, info: ValidationInfo) -> int:
        """Validate the chunk overlap.

        Args:
            v: Chunk overlap value
            info: Validation context information

        Returns:
            Validated chunk overlap value.
        """
        chunk_size = info.data.get('chunk_size', None)
        if chunk_size is not None and v >= chunk_size:
            raise ValueError('Chunk overlap must be less than chunk size')
        return v
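

# Illustrative sketch (not part of the packaged module): the validators above
# accept any chunk_overlap strictly smaller than chunk_size and reject the rest.
# Assumes the wheel and pydantic v2 are installed; the helper name is ours.
def _example_repository_config_validation():
    from pydantic import ValidationError

    # Valid: 200-character overlap on 1000-character chunks ('.' exists locally).
    ok = RepositoryConfig(repository_path='.', chunk_size=1000, chunk_overlap=200)
    assert ok.chunk_overlap == 200

    # Invalid: overlap equal to chunk size raises a ValidationError.
    try:
        RepositoryConfig(repository_path='.', chunk_size=500, chunk_overlap=500)
    except ValidationError as err:
        print(err)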


class IndexConfig(BaseModel):
    """Configuration for the indexing process.

    This class defines the configuration parameters for the indexing process,
    including the embedding model and AWS-specific settings.
    """

    embedding_model: str
    aws_region: Optional[str] = None
    aws_profile: Optional[str] = None
    index_dir: Optional[str] = None

    @field_validator('embedding_model')
    @classmethod
    def validate_embedding_model(cls, embedding_model):
        """Validate the embedding model.

        Args:
            embedding_model: AWS embedding model

        Returns:
            Validated embedding model string.
        """
        # Allow test-model for testing purposes
        if embedding_model == 'test-model':
            return embedding_model

        if embedding_model not in EmbeddingModel.__members__.values():
            raise ValueError(
                f'Invalid embedding model. Must be one of: {list(EmbeddingModel.__members__.values())}'
            )
        return embedding_model

    @field_validator('aws_region')
    @classmethod
    def validate_aws_region(cls, aws_region_string):
        """Validate the AWS region.

        Args:
            aws_region_string: AWS region string

        Returns:
            Validated AWS region string.
        """
        # Allow any region format or None
        return aws_region_string
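

# Illustrative sketch (not part of the packaged module): IndexConfig only
# validates its fields, so no AWS call happens at construction time. The value
# 'test-model' is explicitly allowed for tests, and a None index_dir is later
# replaced by ~/<Constants.DEFAULT_INDEX_DIR> inside RepositoryIndexer.
def _example_index_config():
    cfg = IndexConfig(embedding_model='test-model', aws_region='us-east-1')
    assert cfg.index_dir is None  # RepositoryIndexer supplies the default later
    return cfg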


def get_docstore_dict(docstore):
    """Safely get the document dictionary from a docstore.

    Args:
        docstore: LangChain docstore object

    Returns:
        Document dictionary if _dict exists, empty dict otherwise
    """
    return docstore._dict if hasattr(docstore, '_dict') else {}


def ensure_docstore_dict(docstore):
    """Ensure the docstore has a _dict attribute.

    Args:
        docstore: LangChain docstore object

    Returns:
        The docstore's _dict (creating it if needed)
    """
    if not hasattr(docstore, '_dict'):
        docstore._dict = {}
    return docstore._dict


def get_docstore_dict_size(docstore):
    """Safely get the size of the document dictionary from a docstore.

    Args:
        docstore: LangChain docstore object

    Returns:
        Size of document dictionary if _dict exists, 0 otherwise
    """
    return len(get_docstore_dict(docstore))


def save_index_without_pickle(vector_store, index_path):
    """Save FAISS index without using pickle.

    Args:
        vector_store: FAISS vector store
        index_path: Path to save the index

    This function saves a FAISS index using FAISS's native methods and JSON
    instead of pickle for serialization.
    """
    os.makedirs(index_path, exist_ok=True)

    # 1. Save FAISS index using faiss's native methods
    faiss_path = os.path.join(index_path, 'index.faiss')
    faiss.write_index(vector_store.index, faiss_path)

    # 2. Save docstore as JSON
    docstore_path = os.path.join(index_path, 'docstore.json')
    docstore_data = {}
    for doc_id, doc in get_docstore_dict(vector_store.docstore).items():
        docstore_data[doc_id] = {'page_content': doc.page_content, 'metadata': doc.metadata}

    with open(docstore_path, 'w') as f:
        json.dump(docstore_data, f)

    # 3. Save index_to_docstore_id mapping as JSON
    mapping_path = os.path.join(index_path, 'index_mapping.json')
    # Convert numeric keys to strings for JSON serialization
    mapping = {str(k): v for k, v in vector_store.index_to_docstore_id.items()}
    with open(mapping_path, 'w') as f:
        json.dump(mapping, f)


def save_chunk_map_without_pickle(chunk_map, index_path):
    """Save chunk map without using pickle.

    Args:
        chunk_map: Chunk map to save
        index_path: Path to save the chunk map

    This function saves a chunk map using JSON instead of pickle for serialization.
    """
    # Convert the chunk map to a JSON-serializable format
    serializable_chunk_map = {'chunks': chunk_map['chunks'], 'chunk_to_file': {}}

    # Convert the chunk_to_file dictionary to a serializable format
    # Since chunks are not hashable in JSON, we use indices
    for i, chunk in enumerate(chunk_map['chunks']):
        if chunk in chunk_map['chunk_to_file']:
            serializable_chunk_map['chunk_to_file'][str(i)] = chunk_map['chunk_to_file'][chunk]

    # Save as JSON
    chunk_map_path = os.path.join(index_path, 'chunk_map.json')
    with open(chunk_map_path, 'w') as f:
        json.dump(serializable_chunk_map, f)


def load_chunk_map_without_pickle(index_path):
    """Load chunk map without using pickle.

    Args:
        index_path: Path to the chunk map

    Returns:
        Chunk map dictionary if found, None otherwise

    This function loads a chunk map using JSON instead of pickle for serialization.
    """
    chunk_map_path = os.path.join(index_path, 'chunk_map.json')

    if not os.path.exists(chunk_map_path):
        return None

    try:
        with open(chunk_map_path, 'r') as f:
            serialized_map = json.load(f)

        # Reconstruct the chunk-to-file mapping
        chunks = serialized_map['chunks']
        chunk_to_file = {}
        for i, chunk in enumerate(chunks):
            if str(i) in serialized_map['chunk_to_file']:
                chunk_to_file[chunk] = serialized_map['chunk_to_file'][str(i)]

        return {'chunks': chunks, 'chunk_to_file': chunk_to_file}
    except Exception as e:
        logger.error(f'Error loading chunk map: {e}')
        return None
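

# Illustrative sketch (not part of the packaged module): the two helpers above
# round-trip a chunk map through chunk_map.json without pickle. Because the
# chunk_to_file keys are the chunk texts themselves, they are re-keyed by list
# index on disk and rebuilt on load.
def _example_chunk_map_round_trip(tmp_dir: str):
    os.makedirs(tmp_dir, exist_ok=True)
    chunk_map = {
        'chunks': ['def hello(): ...', '# README intro'],
        'chunk_to_file': {
            'def hello(): ...': 'src/hello.py',
            '# README intro': 'README.md',
        },
    }
    save_chunk_map_without_pickle(chunk_map, tmp_dir)
    restored = load_chunk_map_without_pickle(tmp_dir)
    assert restored == chunk_map
    return restored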


class RepositoryIndexer:
    """Indexer for Git repositories using LangChain's FAISS implementation.

    This class provides methods for creating and managing FAISS indices
    for Git repositories.
    """

    def __init__(self, config: IndexConfig):
        """Initialize the repository indexer.

        Args:
            config: IndexConfig object with indexer configuration
        """
        self.embedding_model = config.embedding_model
        self.aws_region = config.aws_region
        self.aws_profile = config.aws_profile
        self.index_dir = config.index_dir or os.path.expanduser(f'~/{Constants.DEFAULT_INDEX_DIR}')

        # Create the index directory if it doesn't exist
        os.makedirs(self.index_dir, exist_ok=True)

        # Initialize the embedding generator
        self.embedding_generator = get_embedding_model(
            model_id=self.embedding_model,
            aws_region=self.aws_region,
            aws_profile=self.aws_profile,
        )

    def _get_index_path(self, repository_name: str) -> str:
        """Get the path to the index directory for a repository.

        Args:
            repository_name: Name of the repository

        Returns:
            Path to the index directory
        """
        # Sanitize the repository name for use in a filename
        sanitized_name = ''.join(c if c.isalnum() or c in '-_' else '_' for c in repository_name)
        return os.path.join(self.index_dir, sanitized_name)

    def _get_metadata_path(self, repository_name: str) -> str:
        """Get the path to the metadata file for a repository.

        Args:
            repository_name: Name of the repository

        Returns:
            Path to the metadata file
        """
        # Store metadata file in the repository's index directory
        index_path = self._get_index_path(repository_name)
        return os.path.join(index_path, 'metadata.json')

    def _get_chunk_map_path(self, repository_name: str) -> str:
        """Get the path to the chunk map file for a repository.

        Args:
            repository_name: Name of the repository

        Returns:
            Path to the chunk map file
        """
        # Store chunk map file in the repository's index directory
        index_path = self._get_index_path(repository_name)
        return os.path.join(index_path, 'chunk_map.json')

    async def index_repository(
        self,
        config: RepositoryConfig,
        ctx: Optional[Any] = None,
    ) -> IndexRepositoryResponse:
        """Index a Git repository.

        Args:
            config: RepositoryConfig object with indexing configuration
            ctx: Context object for progress tracking (optional)

        Returns:
            IndexRepositoryResponse object with information about the created index

        Raises:
            Exception: If indexing fails
        """
        start_time = time.time()
        temp_dir = None

        try:
            # Initialize helper classes
            repo_processor = RepositoryProcessor()
            index_builder = IndexBuilder()
            file_manager = FileManager()
            metadata_manager = MetadataManager()

            # Step 1: Repository preparation and processing
            repo_path, repository_name, temp_dir = await repo_processor.prepare_repository(
                config.repository_path, ctx
            )

            if ctx:
                await ctx.report_progress(0, 100)

            chunks, chunk_to_file, extension_stats = await repo_processor.process_content(
                repo_path, config, ctx
            )

            if not chunks:
                logger.warning('No text chunks found in repository')
                if ctx:
                    await ctx.info('No text chunks found in repository')
                    await ctx.report_progress(100, 100)
                return IndexRepositoryResponse(
                    status='error',
                    repository_name=repository_name,
                    repository_path=config.repository_path,
                    index_path='',
                    repository_directory=repo_path,
                    file_count=0,
                    chunk_count=0,
                    embedding_model=self.embedding_model,
                    execution_time_ms=int((time.time() - start_time) * 1000),
                    message='No text chunks found in repository',
                )

            # Step 2: Index creation
            documents = await index_builder.create_documents(chunks, chunk_to_file, ctx)
            index_path = self._get_index_path(config.output_path or repository_name)
            repo_files_path = os.path.join(index_path, 'repository')
            os.makedirs(repo_files_path, exist_ok=True)

            # Step 3: File management
            await file_manager.copy_repository_files(repo_path, repo_files_path, ctx)
            vector_store = await index_builder.create_vector_store(
                documents, self.embedding_generator, ctx
            )
            index_builder.save_index(vector_store, index_path)

            # Save chunk map
            chunk_map_data = {'chunks': chunks, 'chunk_to_file': chunk_to_file}
            file_manager.save_chunk_map(chunk_map_data, index_path)

            # Step 4: Metadata management
            last_commit_id = await repo_processor.get_commit_id(
                repo_path, repository_name, config.repository_path
            )

            metadata = await metadata_manager.create_and_save(
                {
                    'repository_name': repository_name,
                    'config': config,
                    'index_path': index_path,
                    'repo_files_path': repo_files_path,
                    'chunks': chunks,
                    'chunk_to_file': chunk_to_file,
                    'extension_stats': extension_stats,
                    'last_commit_id': last_commit_id,
                    'embedding_model': self.embedding_model,
                },
                ctx,
            )

            # Return success response
            execution_time_ms = int((time.time() - start_time) * 1000)
            logger.info(f'Indexing completed in {execution_time_ms}ms')

            if ctx:
                await ctx.info(f'Indexing completed in {execution_time_ms}ms')
                await ctx.report_progress(100, 100)

            return IndexRepositoryResponse(
                status='success',
                repository_name=metadata.repository_name,
                repository_path=config.repository_path,
                index_path=index_path,
                repository_directory=repo_files_path,
                file_count=metadata.file_count,
                chunk_count=metadata.chunk_count,
                embedding_model=self.embedding_model,
                execution_time_ms=execution_time_ms,
                message=f'Successfully indexed repository with {metadata.file_count} files and {metadata.chunk_count} chunks',
            )

        except Exception as e:
            logger.error(f'Error indexing repository: {e}')
            error_message = f'Error indexing repository: {str(e)}'

            if ctx:
                await ctx.error(error_message)
                await ctx.report_progress(100, 100)

            return IndexRepositoryResponse(
                status='error',
                repository_name=get_repository_name(config.repository_path),
                repository_path=config.repository_path,
                index_path='',
                repository_directory=locals().get('repo_path'),
                file_count=0,
                chunk_count=0,
                embedding_model=self.embedding_model,
                execution_time_ms=int((time.time() - start_time) * 1000),
                message=error_message,
            )

        finally:
            if temp_dir:
                cleanup_repository(temp_dir)

    def load_index_without_pickle(self, index_path):
        """Load FAISS index without using pickle.

        Args:
            index_path: Path to the index

        Returns:
            FAISS vector store

        This function loads a FAISS index using FAISS's native methods and JSON
        instead of pickle for serialization.
        """
        # 1. Load FAISS index using faiss's native methods
        faiss_path = os.path.join(index_path, 'index.faiss')
        index = faiss.read_index(faiss_path)

        # 2. Load docstore from JSON
        docstore_path = os.path.join(index_path, 'docstore.json')
        with open(docstore_path, 'r') as f:
            docstore_data = json.load(f)

        # Reconstruct the document store
        docstore = InMemoryDocstore({})
        for doc_id, doc_data in docstore_data.items():
            dict_obj = ensure_docstore_dict(docstore)
            dict_obj[doc_id] = Document(
                page_content=doc_data['page_content'], metadata=doc_data['metadata']
            )

        # 3. Load index_to_docstore_id mapping from JSON
        mapping_path = os.path.join(index_path, 'index_mapping.json')
        with open(mapping_path, 'r') as f:
            mapping_data = json.load(f)

        # Convert string keys back to integers for the mapping
        index_to_docstore_id = {int(k): v for k, v in mapping_data.items()}

        # 4. Create and return the FAISS vector store
        return FAISS(
            embedding_function=self.embedding_generator,
            index=index,
            docstore=docstore,
            index_to_docstore_id=index_to_docstore_id,
        )
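

# Illustrative sketch (not part of the packaged module): reload a previously
# built index and query it. RepositoryIndexer.__init__ constructs a real
# embedding client, so this assumes AWS credentials with access to the chosen
# Bedrock embedding model; 'repo_name' stands in for an index created earlier
# by index_repository, and model_id must match a value of EmbeddingModel from
# models.py. similarity_search is the standard LangChain FAISS query method.
def _example_query_saved_index(model_id: str, repo_name: str, query: str):
    indexer = RepositoryIndexer(IndexConfig(embedding_model=model_id))
    store = indexer.load_index_without_pickle(indexer._get_index_path(repo_name))
    for doc in store.similarity_search(query, k=3):
        print(doc.metadata.get('source'), doc.page_content[:80])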


class RepositoryProcessor:
    """Handles repository-specific operations for indexing."""

    async def prepare_repository(
        self, repository_path: str, ctx: Optional[Any] = None
    ) -> Tuple[str, str, Optional[str]]:
        """Prepare the repository for indexing.

        Args:
            repository_path: Path or URL to the repository
            ctx: Context object for progress tracking (optional)

        Returns:
            Tuple containing:
            - Path to the repository
            - Name of the repository
            - Temporary directory if created (for cleanup), None otherwise
        """
        temp_dir = None
        # If the repository path is a URL, clone it
        if is_git_url(repository_path):
            logger.info(f'Cloning repository from {repository_path}')
            if ctx:
                await ctx.info(f'Cloning repository from {repository_path}')
            temp_dir = clone_repository(repository_path)
            repo_path = temp_dir
        else:
            repo_path = repository_path

        # Get the repository name
        repository_name = get_repository_name(repository_path)
        logger.info(f'Indexing repository: {repository_name}')
        if ctx:
            await ctx.info(f'Indexing repository: {repository_name}')

        return repo_path, repository_name, temp_dir

    async def process_content(
        self, repo_path: str, config: RepositoryConfig, ctx: Optional[Any] = None
    ) -> Tuple[List[str], Dict[str, str], Dict[str, int]]:
        """Process repository files to get text chunks.

        Args:
            repo_path: Path to the repository
            config: Repository configuration
            ctx: Context object for progress tracking (optional)

        Returns:
            Tuple containing:
            - List of text chunks
            - Mapping of chunks to file paths
            - Statistics about file extensions
        """
        if ctx:
            await ctx.info('Processing repository files...')
            await ctx.report_progress(10, 100)

        chunks, chunk_to_file, extension_stats = process_repository(
            repo_path,
            include_patterns=config.include_patterns,
            exclude_patterns=config.exclude_patterns,
            chunk_size=config.chunk_size,
            chunk_overlap=config.chunk_overlap,
        )

        if ctx:
            await ctx.report_progress(30, 100)

        return chunks, chunk_to_file, extension_stats

    async def get_commit_id(
        self, repo_path: str, repository_name: str, repository_path: str
    ) -> str:
        """Get the last commit ID for a repository.

        Args:
            repo_path: Path to the repository
            repository_name: Name of the repository
            repository_path: Original path/URL to the repository

        Returns:
            Last commit ID, or 'unknown' if not available
        """
        last_commit_id = None
        if is_git_url(repository_path) or is_git_repo(repo_path):
            logger.info(f'Attempting to get last commit ID for {repository_name}')

            git_dir = os.path.join(repo_path, '.git')
            if os.path.exists(git_dir):
                logger.info(f'.git directory found at {git_dir}')
                try:
                    repo = Repo(repo_path)
                    if repo.heads:
                        last_commit = repo.head.commit
                        last_commit_id = last_commit.hexsha
                        logger.info(f'Successfully got last commit ID: {last_commit_id}')
                    else:
                        logger.warning('Repository has no commits')
                except Exception as e:
                    logger.warning(f'Error accessing Git repository: {e}')
                    logger.exception(e)
            else:
                logger.warning(f'.git directory not found at {git_dir}')

        return last_commit_id or 'unknown'


class IndexBuilder:
    """Handles FAISS index creation and management."""

    async def create_documents(
        self, chunks: List[str], chunk_to_file: Dict[str, str], ctx: Optional[Any] = None
    ) -> List[Document]:
        """Convert chunks to LangChain Document objects.

        Args:
            chunks: List of text chunks
            chunk_to_file: Mapping of chunks to file paths
            ctx: Context object for progress tracking (optional)

        Returns:
            List of LangChain Document objects
        """
        if ctx:
            await ctx.info(f'Converting {len(chunks)} chunks to Document objects...')
            await ctx.report_progress(40, 100)

        documents = []
        for i, chunk in enumerate(chunks):
            file_path = chunk_to_file.get(chunk, 'unknown')
            documents.append(
                Document(
                    page_content=chunk,
                    metadata={'source': file_path, 'chunk_id': i},
                )
            )

        logger.debug(f'Number of documents to embed: {len(documents)}')
        return documents

    async def create_vector_store(
        self, documents: List[Document], embedding_generator, ctx: Optional[Any] = None
    ) -> FAISS:
        """Create a FAISS vector store from documents.

        Args:
            documents: List of LangChain Document objects
            embedding_generator: Embedding function to use
            ctx: Context object for progress tracking (optional)

        Returns:
            FAISS vector store
        """
        logger.info('Creating FAISS index with LangChain')
        if ctx:
            await ctx.info('Creating FAISS index...')
            await ctx.report_progress(70, 100)

        logger.debug(f'Using embedding function: {embedding_generator}')

        # Test the embedding function
        try:
            logger.info('Testing embedding function on sample document...')
            test_content = documents[0].page_content if documents else 'Test content'
            test_result = embedding_generator.embed_documents([test_content])
            logger.info(
                f'Test embedding successful - shape: {len(test_result)}x{len(test_result[0])}'
            )
        except Exception as e:
            logger.error(f'Embedding function test failed: {e}')
            raise

        if ctx:
            await ctx.info('Generating embeddings and creating vector store...')
            await ctx.report_progress(75, 100)

        logger.debug(f'Number of documents: {len(documents)}')

        try:
            vector_store = FAISS.from_documents(
                documents=documents, embedding=embedding_generator, normalize_L2=True
            )
            logger.debug(
                f'Created vector store with {get_docstore_dict_size(vector_store.docstore)} documents'
            )
            return vector_store
        except Exception as e:
            logger.error(f'Error creating vector store: {e}')
            logger.error(f'Document count: {len(documents)}')
            logger.error(
                f'First document content: {documents[0].page_content[:100] if documents else "None"}'
            )
            raise

    def save_index(self, vector_store: FAISS, index_path: str):
        """Save FAISS index without using pickle.

        Args:
            vector_store: FAISS vector store
            index_path: Path to save the index
        """
        save_index_without_pickle(vector_store, index_path)


class FileManager:
    """Handles file operations for indexing."""

    async def copy_repository_files(
        self, repo_path: str, repo_files_path: str, ctx: Optional[Any] = None
    ) -> int:
        """Copy all files from the repository to the target directory.

        Args:
            repo_path: Source repository path
            repo_files_path: Target path for copied files
            ctx: Context object for progress tracking (optional)

        Returns:
            Number of copied files
        """
        logger.info(f'Copying all files from {repo_path} to {repo_files_path}')
        if ctx:
            await ctx.info('Copying repository files...')
            await ctx.report_progress(60, 100)

        # First, ensure the target directory is empty
        if os.path.exists(repo_files_path):
            shutil.rmtree(repo_files_path)
        os.makedirs(repo_files_path, exist_ok=True)

        copied_files = 0
        for root, dirs, files in os.walk(repo_path):
            if '.git' in root.split(os.sep):
                continue

            rel_path = os.path.relpath(root, repo_path)
            if rel_path == '.':
                rel_path = ''

            target_dir = os.path.join(repo_files_path, rel_path)
            os.makedirs(target_dir, exist_ok=True)

            for file in files:
                source_file = os.path.join(root, file)
                target_file = os.path.join(target_dir, file)
                try:
                    shutil.copy2(source_file, target_file)
                    copied_files += 1
                except Exception as e:
                    logger.warning(f'Error copying file {source_file}: {e}')

        logger.info(f'Copied {copied_files} files to {repo_files_path}')
        return copied_files

    def save_chunk_map(self, chunk_map_data: Dict, index_path: str):
        """Save chunk map without using pickle.

        Args:
            chunk_map_data: Chunk map to save
            index_path: Path to save the chunk map
        """
        save_chunk_map_without_pickle(chunk_map_data, index_path)


class MetadataManager:
    """Handles metadata operations for indexing."""

    async def create_and_save(
        self, params: Dict[str, Any], ctx: Optional[Any] = None
    ) -> IndexMetadata:
        """Create and save metadata for the indexed repository.

        Args:
            params: Dictionary containing metadata parameters
            ctx: Context object for progress tracking (optional)

        Returns:
            Created IndexMetadata object
        """
        if ctx:
            await ctx.info('Finalizing index metadata...')
            await ctx.report_progress(90, 100)

        # Get index size
        index_size = 0
        for root, _, files in os.walk(params['index_path']):
            for file in files:
                index_size += os.path.getsize(os.path.join(root, file))

        # Use output_path as repository_name if provided
        final_repo_name = params['config'].output_path or params['repository_name']

        metadata = IndexMetadata(
            repository_name=final_repo_name,
            repository_path=params['config'].repository_path,
            index_path=params['index_path'],
            created_at=datetime.now(),
            last_accessed=None,
            file_count=len(set(params['chunk_to_file'].values())),
            chunk_count=len(params['chunks']),
            embedding_model=params['embedding_model'],
            file_types=params['extension_stats'],
            total_tokens=None,
            index_size_bytes=index_size,
            last_commit_id=params['last_commit_id'],
            repository_directory=params['repo_files_path'],
        )

        # Save metadata
        metadata_path = os.path.join(params['index_path'], 'metadata.json')
        metadata_json = metadata.model_dump_json(indent=2)
        with open(metadata_path, 'w') as f:
            f.write(metadata_json)

        return metadata


def get_repository_indexer(config: IndexConfig) -> RepositoryIndexer:
    """Factory method to return a repository indexer.

    Args:
        config: IndexConfig object with indexer configuration

    Returns:
        RepositoryIndexer instance
    """
    return RepositoryIndexer(config)
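
Usage sketch (illustrative, not part of the package contents above): indexing a
repository end to end through the factory function. This assumes the wheel is
installed and AWS credentials grant access to a Bedrock embedding model; the
model id is a placeholder that must match a value of EmbeddingModel in
models.py, and the repository URL is likewise only an example.

import asyncio

from awslabs.git_repo_research_mcp_server.indexer import (
    IndexConfig,
    RepositoryConfig,
    get_repository_indexer,
)


async def main():
    indexer = get_repository_indexer(
        IndexConfig(embedding_model='amazon.titan-embed-text-v2:0')  # placeholder model id
    )
    response = await indexer.index_repository(
        RepositoryConfig(
            repository_path='https://github.com/awslabs/mcp.git',  # example repository
            include_patterns=['**/*.py', '**/*.md'],
            chunk_size=1000,
            chunk_overlap=200,
        )
    )
    print(response.status, response.index_path, response.chunk_count)


if __name__ == '__main__':
    asyncio.run(main())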