awslabs.git-repo-research-mcp-server 0.0.1__py3-none-any.whl

@@ -0,0 +1,860 @@
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+ # with the License. A copy of the License is located at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES
+ # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions
+ # and limitations under the License.
+ """FAISS indexing for Git Repository Research MCP Server using LangChain.
+
+ This module provides functionality for creating and managing FAISS indices
+ for Git repositories using LangChain's FAISS implementation.
+ """
+
+ import faiss
+ import json
+ import os
+ import shutil
+ import time
+ from awslabs.git_repo_research_mcp_server.defaults import Constants
+ from awslabs.git_repo_research_mcp_server.embeddings import get_embedding_model
+ from awslabs.git_repo_research_mcp_server.models import (
+     EmbeddingModel,
+     IndexMetadata,
+     IndexRepositoryResponse,
+ )
+ from awslabs.git_repo_research_mcp_server.repository import (
+     cleanup_repository,
+     clone_repository,
+     get_repository_name,
+     is_git_repo,
+     is_git_url,
+     process_repository,
+ )
+ from datetime import datetime
+ from git import Repo
+ from langchain_community.docstore.in_memory import InMemoryDocstore
+ from langchain_community.vectorstores import FAISS
+ from langchain_core.documents import Document
+ from loguru import logger
+ from pydantic import BaseModel, field_validator
+ from pydantic_core.core_schema import ValidationInfo
+ from typing import Any, Dict, List, Optional, Tuple
+
+
+ class RepositoryConfig(BaseModel):
+     """Configuration for repository indexing.
+
+     This class defines the configuration parameters for indexing a Git repository,
+     including paths, patterns for file inclusion/exclusion, and chunking parameters.
+     """
+
+     repository_path: str
+     output_path: Optional[str] = None
+     include_patterns: Optional[List[str]] = None
+     exclude_patterns: Optional[List[str]] = None
+     chunk_size: int = 1000
+     chunk_overlap: int = 200
+
+     @field_validator('repository_path')
+     @classmethod
+     def validate_repository_path(cls, git_string_url):
+         """Validate the repository path.
+
+         Args:
+             git_string_url: Git URL or local path
+
+         Returns:
+             Validated repository path.
+         """
+         if not (is_git_url(git_string_url) or os.path.exists(git_string_url)):
+             raise ValueError('Repository path must be a valid Git URL or existing local path')
+         return git_string_url
+
+     @field_validator('chunk_size')
+     @classmethod
+     def validate_chunk_size(cls, chunk_size):
+         """Validate the chunk size.
+
+         Args:
+             chunk_size: Chunk size value
+
+         Returns:
+             Validated chunk size.
+         """
+         if chunk_size <= 0:
+             raise ValueError('Chunk size must be positive')
+         return chunk_size
+
+     @field_validator('chunk_overlap')
+     @classmethod
+     def validate_chunk_overlap(cls, v: int, info: ValidationInfo) -> int:
+         """Validate the chunk overlap.
+
+         Args:
+             v: Chunk overlap value
+             info: Validation context information
+
+         Returns:
+             Validated chunk overlap value.
+         """
+         chunk_size = info.data.get('chunk_size', None)
+         if chunk_size is not None and v >= chunk_size:
+             raise ValueError('Chunk overlap must be less than chunk size')
+         return v
+
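For illustration only (not lines from the package), a minimal sketch of how these validators behave, assuming pydantic v2 and an existing local path:

    from pydantic import ValidationError

    RepositoryConfig(repository_path='.', chunk_size=1000, chunk_overlap=200)  # accepted
    try:
        RepositoryConfig(repository_path='.', chunk_size=500, chunk_overlap=500)
    except ValidationError:
        pass  # rejected: chunk overlap must be less than chunk size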
+
+ class IndexConfig(BaseModel):
+     """Configuration for the indexing process.
+
+     This class defines the configuration parameters for the indexing process,
+     including the embedding model and AWS-specific settings.
+     """
+
+     embedding_model: str
+     aws_region: Optional[str] = None
+     aws_profile: Optional[str] = None
+     index_dir: Optional[str] = None
+
+     @field_validator('embedding_model')
+     @classmethod
+     def validate_embedding_model(cls, embedding_model):
+         """Validate the embedding model.
+
+         Args:
+             embedding_model: AWS embedding model
+
+         Returns:
+             Validated embedding model string.
+         """
+         # Allow test-model for testing purposes
+         if embedding_model == 'test-model':
+             return embedding_model
+
+         if embedding_model not in EmbeddingModel.__members__.values():
+             raise ValueError(
+                 f'Invalid embedding model. Must be one of: {list(EmbeddingModel.__members__.values())}'
+             )
+         return embedding_model
+
+     @field_validator('aws_region')
+     @classmethod
+     def validate_aws_region(cls, aws_region_string):
+         """Validate the AWS region.
+
+         Args:
+             aws_region_string: AWS region string
+
+         Returns:
+             Validated AWS region string.
+         """
+         # Allow any region format or None
+         return aws_region_string
+
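A matching sketch for the indexer configuration (again not package code); 'test-model' is used here only because the validator above explicitly allows it, and any real value would have to come from EmbeddingModel:

    config = IndexConfig(
        embedding_model='test-model',  # explicitly whitelisted by validate_embedding_model
        aws_region='us-east-1',        # passes through unchecked
        index_dir=None,                # RepositoryIndexer falls back to ~/<Constants.DEFAULT_INDEX_DIR>
    )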
+
+ def get_docstore_dict(docstore):
+     """Safely get the document dictionary from a docstore.
+
+     Args:
+         docstore: LangChain docstore object
+
+     Returns:
+         Document dictionary if _dict exists, empty dict otherwise
+     """
+     return docstore._dict if hasattr(docstore, '_dict') else {}
+
+
+ def ensure_docstore_dict(docstore):
+     """Ensure the docstore has a _dict attribute.
+
+     Args:
+         docstore: LangChain docstore object
+
+     Returns:
+         The docstore's _dict (creating it if needed)
+     """
+     if not hasattr(docstore, '_dict'):
+         docstore._dict = {}
+     return docstore._dict
+
+
+ def get_docstore_dict_size(docstore):
+     """Safely get the size of the document dictionary from a docstore.
+
+     Args:
+         docstore: LangChain docstore object
+
+     Returns:
+         Size of document dictionary if _dict exists, 0 otherwise
+     """
+     return len(get_docstore_dict(docstore))
+
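These helpers exist because the module reaches into InMemoryDocstore's private _dict attribute, an implementation detail of langchain_community rather than public API. A small sketch (not package code) of the behavior they guard:

    store = InMemoryDocstore({})
    assert get_docstore_dict_size(store) == 0           # {} instead of an AttributeError
    ensure_docstore_dict(store)['doc-0'] = Document(page_content='hello', metadata={})
    assert get_docstore_dict_size(store) == 1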
+
+ def save_index_without_pickle(vector_store, index_path):
+     """Save FAISS index without using pickle.
+
+     Args:
+         vector_store: FAISS vector store
+         index_path: Path to save the index
+
+     This function saves a FAISS index using FAISS's native methods and JSON
+     instead of pickle for serialization.
+     """
+     os.makedirs(index_path, exist_ok=True)
+
+     # 1. Save FAISS index using faiss's native methods
+     faiss_path = os.path.join(index_path, 'index.faiss')
+     faiss.write_index(vector_store.index, faiss_path)
+
+     # 2. Save docstore as JSON
+     docstore_path = os.path.join(index_path, 'docstore.json')
+     docstore_data = {}
+     for doc_id, doc in get_docstore_dict(vector_store.docstore).items():
+         docstore_data[doc_id] = {'page_content': doc.page_content, 'metadata': doc.metadata}
+
+     with open(docstore_path, 'w') as f:
+         json.dump(docstore_data, f)
+
+     # 3. Save index_to_docstore_id mapping as JSON
+     mapping_path = os.path.join(index_path, 'index_mapping.json')
+     # Convert numeric keys to strings for JSON serialization
+     mapping = {str(k): v for k, v in vector_store.index_to_docstore_id.items()}
+     with open(mapping_path, 'w') as f:
+         json.dump(mapping, f)
+
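After a save, the index directory therefore holds three artifacts; a sketch of the layout this function produces:

    <index_path>/
        index.faiss          # raw FAISS index written via faiss.write_index
        docstore.json        # {doc_id: {"page_content": ..., "metadata": ...}}
        index_mapping.json   # {str(vector position): doc_id}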
+
+ def save_chunk_map_without_pickle(chunk_map, index_path):
+     """Save chunk map without using pickle.
+
+     Args:
+         chunk_map: Chunk map to save
+         index_path: Directory in which to save the chunk map
+
+     This function saves a chunk map using JSON instead of pickle for serialization.
+     """
+     # Convert the chunk map to a JSON-serializable format
+     serializable_chunk_map = {'chunks': chunk_map['chunks'], 'chunk_to_file': {}}
+
+     # JSON object keys must be strings, and full chunk text makes an unwieldy key,
+     # so the mapping is keyed by each chunk's index in the 'chunks' list instead
+     for i, chunk in enumerate(chunk_map['chunks']):
+         if chunk in chunk_map['chunk_to_file']:
+             serializable_chunk_map['chunk_to_file'][str(i)] = chunk_map['chunk_to_file'][chunk]
+
+     # Save as JSON
+     chunk_map_path = os.path.join(index_path, 'chunk_map.json')
+     with open(chunk_map_path, 'w') as f:
+         json.dump(serializable_chunk_map, f)
+
+
+ def load_chunk_map_without_pickle(index_path):
+     """Load chunk map without using pickle.
+
+     Args:
+         index_path: Path to the index directory containing the chunk map
+
+     Returns:
+         Chunk map dictionary if found, None otherwise
+
+     This function loads a chunk map using JSON instead of pickle for serialization.
+     """
+     chunk_map_path = os.path.join(index_path, 'chunk_map.json')
+
+     if not os.path.exists(chunk_map_path):
+         return None
+
+     try:
+         with open(chunk_map_path, 'r') as f:
+             serialized_map = json.load(f)
+
+         # Reconstruct the chunk-to-file mapping
+         chunks = serialized_map['chunks']
+         chunk_to_file = {}
+         for i, chunk in enumerate(chunks):
+             if str(i) in serialized_map['chunk_to_file']:
+                 chunk_to_file[chunk] = serialized_map['chunk_to_file'][str(i)]
+
+         return {'chunks': chunks, 'chunk_to_file': chunk_to_file}
+     except Exception as e:
+         logger.error(f'Error loading chunk map: {e}')
+         return None
+
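A round-trip sketch for the two functions above (directory and chunk text hypothetical):

    chunk_map = {
        'chunks': ['def foo(): ...', 'def bar(): ...'],
        'chunk_to_file': {'def foo(): ...': 'src/a.py', 'def bar(): ...': 'src/b.py'},
    }
    os.makedirs('/tmp/my-index', exist_ok=True)  # the save function expects the directory to exist
    save_chunk_map_without_pickle(chunk_map, '/tmp/my-index')
    assert load_chunk_map_without_pickle('/tmp/my-index') == chunk_map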
+
+ class RepositoryIndexer:
+     """Indexer for Git repositories using LangChain's FAISS implementation.
+
+     This class provides methods for creating and managing FAISS indices
+     for Git repositories.
+     """
+
+     def __init__(self, config: IndexConfig):
+         """Initialize the repository indexer.
+
+         Args:
+             config: IndexConfig object with indexer configuration
+         """
+         self.embedding_model = config.embedding_model
+         self.aws_region = config.aws_region
+         self.aws_profile = config.aws_profile
+         self.index_dir = config.index_dir or os.path.expanduser(f'~/{Constants.DEFAULT_INDEX_DIR}')
+
+         # Create the index directory if it doesn't exist
+         os.makedirs(self.index_dir, exist_ok=True)
+
+         # Initialize the embedding generator
+         self.embedding_generator = get_embedding_model(
+             model_id=self.embedding_model,
+             aws_region=self.aws_region,
+             aws_profile=self.aws_profile,
+         )
+
+     def _get_index_path(self, repository_name: str) -> str:
+         """Get the path to the index directory for a repository.
+
+         Args:
+             repository_name: Name of the repository
+
+         Returns:
+             Path to the index directory
+         """
+         # Sanitize the repository name for safe use as a directory name
+         sanitized_name = ''.join(c if c.isalnum() or c in '-_' else '_' for c in repository_name)
+         return os.path.join(self.index_dir, sanitized_name)
+
+     def _get_metadata_path(self, repository_name: str) -> str:
+         """Get the path to the metadata file for a repository.
+
+         Args:
+             repository_name: Name of the repository
+
+         Returns:
+             Path to the metadata file
+         """
+         # Store the metadata file in the repository's index directory
+         index_path = self._get_index_path(repository_name)
+         return os.path.join(index_path, 'metadata.json')
+
+     def _get_chunk_map_path(self, repository_name: str) -> str:
+         """Get the path to the chunk map file for a repository.
+
+         Args:
+             repository_name: Name of the repository
+
+         Returns:
+             Path to the chunk map file
+         """
+         # Store the chunk map file in the repository's index directory
+         index_path = self._get_index_path(repository_name)
+         return os.path.join(index_path, 'chunk_map.json')
+
+     async def index_repository(
+         self,
+         config: RepositoryConfig,
+         ctx: Optional[Any] = None,
+     ) -> IndexRepositoryResponse:
+         """Index a Git repository.
+
+         Args:
+             config: RepositoryConfig object with indexing configuration
+             ctx: Context object for progress tracking (optional)
+
+         Returns:
+             IndexRepositoryResponse object with information about the created index.
+             Failures are caught and reported with status='error' rather than raised.
+         """
+         start_time = time.time()
+         temp_dir = None
+
+         try:
+             # Initialize helper classes
+             repo_processor = RepositoryProcessor()
+             index_builder = IndexBuilder()
+             file_manager = FileManager()
+             metadata_manager = MetadataManager()
+
+             # Step 1: Repository preparation and processing
+             repo_path, repository_name, temp_dir = await repo_processor.prepare_repository(
+                 config.repository_path, ctx
+             )
+
+             if ctx:
+                 await ctx.report_progress(0, 100)
+
+             chunks, chunk_to_file, extension_stats = await repo_processor.process_content(
+                 repo_path, config, ctx
+             )
+
+             if not chunks:
+                 logger.warning('No text chunks found in repository')
+                 if ctx:
+                     await ctx.info('No text chunks found in repository')
+                     await ctx.report_progress(100, 100)
+                 return IndexRepositoryResponse(
+                     status='error',
+                     repository_name=repository_name,
+                     repository_path=config.repository_path,
+                     index_path='',
+                     repository_directory=repo_path,
+                     file_count=0,
+                     chunk_count=0,
+                     embedding_model=self.embedding_model,
+                     execution_time_ms=int((time.time() - start_time) * 1000),
+                     message='No text chunks found in repository',
+                 )
+
+             # Step 2: Index creation
+             documents = await index_builder.create_documents(chunks, chunk_to_file, ctx)
+             index_path = self._get_index_path(config.output_path or repository_name)
+             repo_files_path = os.path.join(index_path, 'repository')
+             os.makedirs(repo_files_path, exist_ok=True)
+
+             # Step 3: File management
+             await file_manager.copy_repository_files(repo_path, repo_files_path, ctx)
+             vector_store = await index_builder.create_vector_store(
+                 documents, self.embedding_generator, ctx
+             )
+             index_builder.save_index(vector_store, index_path)
+
+             # Save chunk map
+             chunk_map_data = {'chunks': chunks, 'chunk_to_file': chunk_to_file}
+             file_manager.save_chunk_map(chunk_map_data, index_path)
+
+             # Step 4: Metadata management
+             last_commit_id = await repo_processor.get_commit_id(
+                 repo_path, repository_name, config.repository_path
+             )
+
+             metadata = await metadata_manager.create_and_save(
+                 {
+                     'repository_name': repository_name,
+                     'config': config,
+                     'index_path': index_path,
+                     'repo_files_path': repo_files_path,
+                     'chunks': chunks,
+                     'chunk_to_file': chunk_to_file,
+                     'extension_stats': extension_stats,
+                     'last_commit_id': last_commit_id,
+                     'embedding_model': self.embedding_model,
+                 },
+                 ctx,
+             )
+
+             # Return success response
+             execution_time_ms = int((time.time() - start_time) * 1000)
+             logger.info(f'Indexing completed in {execution_time_ms}ms')
+
+             if ctx:
+                 await ctx.info(f'Indexing completed in {execution_time_ms}ms')
+                 await ctx.report_progress(100, 100)
+
+             return IndexRepositoryResponse(
+                 status='success',
+                 repository_name=metadata.repository_name,
+                 repository_path=config.repository_path,
+                 index_path=index_path,
+                 repository_directory=repo_files_path,
+                 file_count=metadata.file_count,
+                 chunk_count=metadata.chunk_count,
+                 embedding_model=self.embedding_model,
+                 execution_time_ms=execution_time_ms,
+                 message=f'Successfully indexed repository with {metadata.file_count} files and {metadata.chunk_count} chunks',
+             )
+
+         except Exception as e:
+             logger.error(f'Error indexing repository: {e}')
+             error_message = f'Error indexing repository: {str(e)}'
+
+             if ctx:
+                 await ctx.error(error_message)
+                 await ctx.report_progress(100, 100)
+
+             return IndexRepositoryResponse(
+                 status='error',
+                 repository_name=get_repository_name(config.repository_path),
+                 repository_path=config.repository_path,
+                 index_path='',
+                 repository_directory=locals().get('repo_path'),  # repo_path may be unset if preparation failed
+                 file_count=0,
+                 chunk_count=0,
+                 embedding_model=self.embedding_model,
+                 execution_time_ms=int((time.time() - start_time) * 1000),
+                 message=error_message,
+             )
+
+         finally:
+             if temp_dir:
+                 cleanup_repository(temp_dir)
+
+     def load_index_without_pickle(self, index_path):
+         """Load FAISS index without using pickle.
+
+         Args:
+             index_path: Path to the index
+
+         Returns:
+             FAISS vector store
+
+         This method loads a FAISS index using FAISS's native methods and JSON
+         instead of pickle for serialization, using this indexer's embedding
+         generator as the store's embedding function.
+         """
+         # 1. Load FAISS index using faiss's native methods
+         faiss_path = os.path.join(index_path, 'index.faiss')
+         index = faiss.read_index(faiss_path)
+
+         # 2. Load docstore from JSON
+         docstore_path = os.path.join(index_path, 'docstore.json')
+         with open(docstore_path, 'r') as f:
+             docstore_data = json.load(f)
+
+         # Reconstruct the document store
+         docstore = InMemoryDocstore({})
+         for doc_id, doc_data in docstore_data.items():
+             dict_obj = ensure_docstore_dict(docstore)
+             dict_obj[doc_id] = Document(
+                 page_content=doc_data['page_content'], metadata=doc_data['metadata']
+             )
+
+         # 3. Load index_to_docstore_id mapping from JSON
+         mapping_path = os.path.join(index_path, 'index_mapping.json')
+         with open(mapping_path, 'r') as f:
+             mapping_data = json.load(f)
+
+         # Convert string keys back to integers for the mapping
+         index_to_docstore_id = {int(k): v for k, v in mapping_data.items()}
+
+         # 4. Create and return the FAISS vector store
+         return FAISS(
+             embedding_function=self.embedding_generator,
+             index=index,
+             docstore=docstore,
+             index_to_docstore_id=index_to_docstore_id,
+         )
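Given a RepositoryIndexer instance, the returned store supports LangChain's standard retrieval calls; a minimal sketch, with a hypothetical index path:

    store = indexer.load_index_without_pickle('/path/to/index')
    for doc in store.similarity_search('where is authentication handled?', k=5):
        print(doc.metadata['source'], doc.page_content[:80])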
+
+
+ class RepositoryProcessor:
+     """Handles repository-specific operations for indexing."""
+
+     async def prepare_repository(
+         self, repository_path: str, ctx: Optional[Any] = None
+     ) -> Tuple[str, str, Optional[str]]:
+         """Prepare the repository for indexing.
+
+         Args:
+             repository_path: Path or URL to the repository
+             ctx: Context object for progress tracking (optional)
+
+         Returns:
+             Tuple containing:
+             - Path to the repository
+             - Name of the repository
+             - Temporary directory if created (for cleanup), None otherwise
+         """
+         temp_dir = None
+         # If the repository path is a URL, clone it
+         if is_git_url(repository_path):
+             logger.info(f'Cloning repository from {repository_path}')
+             if ctx:
+                 await ctx.info(f'Cloning repository from {repository_path}')
+             temp_dir = clone_repository(repository_path)
+             repo_path = temp_dir
+         else:
+             repo_path = repository_path
+
+         # Get the repository name
+         repository_name = get_repository_name(repository_path)
+         logger.info(f'Indexing repository: {repository_name}')
+         if ctx:
+             await ctx.info(f'Indexing repository: {repository_name}')
+
+         return repo_path, repository_name, temp_dir
+
+     async def process_content(
+         self, repo_path: str, config: RepositoryConfig, ctx: Optional[Any] = None
+     ) -> Tuple[List[str], Dict[str, str], Dict[str, int]]:
+         """Process repository files to get text chunks.
+
+         Args:
+             repo_path: Path to the repository
+             config: Repository configuration
+             ctx: Context object for progress tracking (optional)
+
+         Returns:
+             Tuple containing:
+             - List of text chunks
+             - Mapping of chunks to file paths
+             - Statistics about file extensions
+         """
+         if ctx:
+             await ctx.info('Processing repository files...')
+             await ctx.report_progress(10, 100)
+
+         chunks, chunk_to_file, extension_stats = process_repository(
+             repo_path,
+             include_patterns=config.include_patterns,
+             exclude_patterns=config.exclude_patterns,
+             chunk_size=config.chunk_size,
+             chunk_overlap=config.chunk_overlap,
+         )
+
+         if ctx:
+             await ctx.report_progress(30, 100)
+
+         return chunks, chunk_to_file, extension_stats
+
+     async def get_commit_id(
+         self, repo_path: str, repository_name: str, repository_path: str
+     ) -> str:
+         """Get the last commit ID for a repository.
+
+         Args:
+             repo_path: Path to the repository
+             repository_name: Name of the repository
+             repository_path: Original path/URL to the repository
+
+         Returns:
+             Last commit ID, or 'unknown' if not available
+         """
+         last_commit_id = None
+         if is_git_url(repository_path) or is_git_repo(repo_path):
+             logger.info(f'Attempting to get last commit ID for {repository_name}')
+
+             git_dir = os.path.join(repo_path, '.git')
+             if os.path.exists(git_dir):
+                 logger.info(f'.git directory found at {git_dir}')
+                 try:
+                     repo = Repo(repo_path)
+                     if repo.heads:
+                         last_commit = repo.head.commit
+                         last_commit_id = last_commit.hexsha
+                         logger.info(f'Successfully got last commit ID: {last_commit_id}')
+                     else:
+                         logger.warning('Repository has no commits')
+                 except Exception as e:
+                     logger.warning(f'Error accessing Git repository: {e}')
+                     logger.exception(e)
+             else:
+                 logger.warning(f'.git directory not found at {git_dir}')
+
+         return last_commit_id or 'unknown'
+
+
+ class IndexBuilder:
+     """Handles FAISS index creation and management."""
+
+     async def create_documents(
+         self, chunks: List[str], chunk_to_file: Dict[str, str], ctx: Optional[Any] = None
+     ) -> List[Document]:
+         """Convert chunks to LangChain Document objects.
+
+         Args:
+             chunks: List of text chunks
+             chunk_to_file: Mapping of chunks to file paths
+             ctx: Context object for progress tracking (optional)
+
+         Returns:
+             List of LangChain Document objects
+         """
+         if ctx:
+             await ctx.info(f'Converting {len(chunks)} chunks to Document objects...')
+             await ctx.report_progress(40, 100)
+
+         documents = []
+         for i, chunk in enumerate(chunks):
+             file_path = chunk_to_file.get(chunk, 'unknown')
+             documents.append(
+                 Document(
+                     page_content=chunk,
+                     metadata={'source': file_path, 'chunk_id': i},
+                 )
+             )
+
+         logger.debug(f'Number of documents to embed: {len(documents)}')
+         return documents
+
+     async def create_vector_store(
+         self, documents: List[Document], embedding_generator, ctx: Optional[Any] = None
+     ) -> FAISS:
+         """Create a FAISS vector store from documents.
+
+         Args:
+             documents: List of LangChain Document objects
+             embedding_generator: Embedding function to use
+             ctx: Context object for progress tracking (optional)
+
+         Returns:
+             FAISS vector store
+         """
+         logger.info('Creating FAISS index with LangChain')
+         if ctx:
+             await ctx.info('Creating FAISS index...')
+             await ctx.report_progress(70, 100)
+
+         logger.debug(f'Using embedding function: {embedding_generator}')
+
+         # Test the embedding function
+         try:
+             logger.info('Testing embedding function on sample document...')
+             test_content = documents[0].page_content if documents else 'Test content'
+             test_result = embedding_generator.embed_documents([test_content])
+             logger.info(
+                 f'Test embedding successful - shape: {len(test_result)}x{len(test_result[0])}'
+             )
+         except Exception as e:
+             logger.error(f'Embedding function test failed: {e}')
+             raise
+
+         if ctx:
+             await ctx.info('Generating embeddings and creating vector store...')
+             await ctx.report_progress(75, 100)
+
+         logger.debug(f'Number of documents: {len(documents)}')
+
+         try:
+             vector_store = FAISS.from_documents(
+                 documents=documents, embedding=embedding_generator, normalize_L2=True
+             )
+             logger.debug(
+                 f'Created vector store with {get_docstore_dict_size(vector_store.docstore)} documents'
+             )
+             return vector_store
+         except Exception as e:
+             logger.error(f'Error creating vector store: {e}')
+             logger.error(f'Document count: {len(documents)}')
+             logger.error(
+                 f'First document content: {documents[0].page_content[:100] if documents else "None"}'
+             )
+             raise
+
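A note on normalize_L2=True: embeddings are L2-normalized before insertion, so the squared L2 distances FAISS reports become a monotone proxy for cosine similarity, since ||a - b||^2 = 2(1 - cos(a, b)) for unit vectors. A quick check of that identity, assuming numpy is available:

    import numpy as np

    a = np.random.rand(8); a /= np.linalg.norm(a)
    b = np.random.rand(8); b /= np.linalg.norm(b)
    assert np.isclose(((a - b) ** 2).sum(), 2 * (1 - a @ b))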
+     def save_index(self, vector_store: FAISS, index_path: str):
+         """Save FAISS index without using pickle.
+
+         Args:
+             vector_store: FAISS vector store
+             index_path: Path to save the index
+         """
+         save_index_without_pickle(vector_store, index_path)
+
+
+ class FileManager:
+     """Handles file operations for indexing."""
+
+     async def copy_repository_files(
+         self, repo_path: str, repo_files_path: str, ctx: Optional[Any] = None
+     ) -> int:
+         """Copy all files from the repository to the target directory.
+
+         Args:
+             repo_path: Source repository path
+             repo_files_path: Target path for copied files
+             ctx: Context object for progress tracking (optional)
+
+         Returns:
+             Number of copied files
+         """
+         logger.info(f'Copying all files from {repo_path} to {repo_files_path}')
+         if ctx:
+             await ctx.info('Copying repository files...')
+             await ctx.report_progress(60, 100)
+
+         # First, ensure the target directory is empty
+         if os.path.exists(repo_files_path):
+             shutil.rmtree(repo_files_path)
+         os.makedirs(repo_files_path, exist_ok=True)
+
+         copied_files = 0
+         for root, dirs, files in os.walk(repo_path):
+             # Skip anything inside the .git directory
+             if '.git' in root.split(os.sep):
+                 continue
+
+             rel_path = os.path.relpath(root, repo_path)
+             if rel_path == '.':
+                 rel_path = ''
+
+             target_dir = os.path.join(repo_files_path, rel_path)
+             os.makedirs(target_dir, exist_ok=True)
+
+             for file in files:
+                 source_file = os.path.join(root, file)
+                 target_file = os.path.join(target_dir, file)
+                 try:
+                     shutil.copy2(source_file, target_file)
+                     copied_files += 1
+                 except Exception as e:
+                     logger.warning(f'Error copying file {source_file}: {e}')
+
+         logger.info(f'Copied {copied_files} files to {repo_files_path}')
+         return copied_files
+
+     def save_chunk_map(self, chunk_map_data: Dict, index_path: str):
+         """Save chunk map without using pickle.
+
+         Args:
+             chunk_map_data: Chunk map to save
+             index_path: Path to save the chunk map
+         """
+         save_chunk_map_without_pickle(chunk_map_data, index_path)
+
+
+ class MetadataManager:
+     """Handles metadata operations for indexing."""
+
+     async def create_and_save(
+         self, params: Dict[str, Any], ctx: Optional[Any] = None
+     ) -> IndexMetadata:
+         """Create and save metadata for the indexed repository.
+
+         Args:
+             params: Dictionary containing metadata parameters
+             ctx: Context object for progress tracking (optional)
+
+         Returns:
+             Created IndexMetadata object
+         """
+         if ctx:
+             await ctx.info('Finalizing index metadata...')
+             await ctx.report_progress(90, 100)
+
+         # Get index size
+         index_size = 0
+         for root, _, files in os.walk(params['index_path']):
+             for file in files:
+                 index_size += os.path.getsize(os.path.join(root, file))
+
+         # Use output_path as repository_name if provided
+         final_repo_name = params['config'].output_path or params['repository_name']
+
+         metadata = IndexMetadata(
+             repository_name=final_repo_name,
+             repository_path=params['config'].repository_path,
+             index_path=params['index_path'],
+             created_at=datetime.now(),
+             last_accessed=None,
+             file_count=len(set(params['chunk_to_file'].values())),
+             chunk_count=len(params['chunks']),
+             embedding_model=params['embedding_model'],
+             file_types=params['extension_stats'],
+             total_tokens=None,
+             index_size_bytes=index_size,
+             last_commit_id=params['last_commit_id'],
+             repository_directory=params['repo_files_path'],
+         )
+
+         # Save metadata
+         metadata_path = os.path.join(params['index_path'], 'metadata.json')
+         metadata_json = metadata.model_dump_json(indent=2)
+         with open(metadata_path, 'w') as f:
+             f.write(metadata_json)
+
+         return metadata
+
+
+ def get_repository_indexer(config: IndexConfig) -> RepositoryIndexer:
+     """Factory function to return a repository indexer.
+
+     Args:
+         config: IndexConfig object with indexer configuration
+
+     Returns:
+         RepositoryIndexer instance
+     """
+     return RepositoryIndexer(config)
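Putting the pieces together, an end-to-end sketch of how this module is meant to be driven (the repository URL is a placeholder, and 'test-model' is only the validator's test escape hatch, not a real embedding model):

    import asyncio

    async def main():
        indexer = get_repository_indexer(IndexConfig(embedding_model='test-model'))
        response = await indexer.index_repository(
            RepositoryConfig(repository_path='https://github.com/example/repo.git')
        )
        print(response.status, response.index_path)

    asyncio.run(main())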