code-finder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. claude_context/__init__.py +33 -0
  2. claude_context/agentic_integration.py +309 -0
  3. claude_context/ast_chunker.py +646 -0
  4. claude_context/config.py +239 -0
  5. claude_context/context_manager.py +627 -0
  6. claude_context/embeddings.py +307 -0
  7. claude_context/embeddings_interface.py +226 -0
  8. claude_context/enhanced_ast_chunker.py +1129 -0
  9. claude_context/explorer.py +951 -0
  10. claude_context/explorer_with_context.py +1008 -0
  11. claude_context/indexer.py +893 -0
  12. claude_context/markdown_chunker.py +421 -0
  13. claude_context/mode_handler.py +1774 -0
  14. claude_context/query_metrics.py +164 -0
  15. claude_context/question_generator.py +800 -0
  16. claude_context/readme_extractor.py +485 -0
  17. claude_context/repository_adapter.py +399 -0
  18. claude_context/search.py +493 -0
  19. claude_context/skills/__init__.py +11 -0
  20. claude_context/skills/_cli_common.py +74 -0
  21. claude_context/skills/_index_manager.py +98 -0
  22. claude_context/skills/api_surface.py +219 -0
  23. claude_context/skills/evidence_retrieval.py +151 -0
  24. claude_context/skills/grounded_review.py +212 -0
  25. claude_context/synthesis/__init__.py +8 -0
  26. claude_context/synthesis/editor_agent.py +391 -0
  27. claude_context/synthesis/llm_synthesizer.py +153 -0
  28. claude_context/synthesis/logic_explainer.py +235 -0
  29. claude_context/synthesis/multi_review_pipeline.py +717 -0
  30. claude_context/synthesis/prompt_builder.py +439 -0
  31. claude_context/synthesis/providers.py +115 -0
  32. claude_context/synthesis/validators.py +458 -0
  33. code_finder-0.1.0.dist-info/METADATA +823 -0
  34. code_finder-0.1.0.dist-info/RECORD +37 -0
  35. code_finder-0.1.0.dist-info/WHEEL +5 -0
  36. code_finder-0.1.0.dist-info/entry_points.txt +4 -0
  37. code_finder-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,893 @@
1
+ """
2
+ Repository Indexer for Claude Context
3
+
4
+ This module handles discovering, chunking, and indexing code repositories
5
+ for semantic search. Supports multiple repository sources through adapters.
6
+ Follows fail-fast principles with clear logging.
7
+ """
8
+
9
+ import hashlib
10
+ import logging
11
+ from pathlib import Path
12
+ from typing import List, Dict, Any, Optional, Tuple, Union, Iterable
13
+ from datetime import datetime
14
+ import json
15
+ import tempfile
16
+ import time
17
+
18
+ import numpy as np
19
+ from tqdm import tqdm
20
+ from pymilvus import Collection, FieldSchema, CollectionSchema, DataType, MilvusClient
21
+
22
+ from .config import ClaudeContextConfig, MilvusManager
23
+ from .embeddings import LocalEmbeddings
24
+ from .ast_chunker import ASTChunker, CodeChunk as ASTCodeChunk
25
+
26
+ # Import enhanced chunker with fallback to basic chunker
27
+ try:
28
+ from .enhanced_ast_chunker import EnhancedASTChunker, EnhancedCodeChunk, HAS_TREE_SITTER
29
+ HAS_ENHANCED_CHUNKER = HAS_TREE_SITTER
30
+ except ImportError:
31
+ HAS_ENHANCED_CHUNKER = False
32
+
33
+ # Import markdown chunker for README and documentation files
34
+ try:
35
+ from .markdown_chunker import MarkdownChunker, MarkdownChunk
36
+ HAS_MARKDOWN_CHUNKER = True
37
+ except ImportError:
38
+ HAS_MARKDOWN_CHUNKER = False
39
+
40
+ logger = logging.getLogger(__name__)
41
+
42
+
43
+ class RepositoryIndexer:
44
+ """
45
+ Index repository files for semantic search.
46
+
47
+ This class handles the complete indexing pipeline:
48
+ 1. Discover files based on extensions and ignore patterns
49
+ 2. Chunk files into semantic units
50
+ 3. Generate embeddings for each chunk
51
+ 4. Store in Milvus for vector search
52
+ """
53
+
54
def __init__(
    self,
    config: ClaudeContextConfig,
    embeddings: LocalEmbeddings,
    milvus_manager: MilvusManager
):
    """
    Wire the indexer to its configuration, embedding model and Milvus client.

    Args:
        config: Provides ``chunk_size`` and ``chunk_overlap`` for the chunkers
            built here; the indexing methods read further settings from it.
        embeddings: Embeddings model used for vectorization
        milvus_manager: Manager whose ``get_client()`` supplies the Milvus client

    Raises:
        ValueError: If any of the three arguments is falsy
    """
    # Fail fast on missing collaborators, checking them in signature order.
    required = (
        ("config", config),
        ("embeddings", embeddings),
        ("milvus_manager", milvus_manager),
    )
    for label, value in required:
        if not value:
            raise ValueError(f"{label} is required")

    self.config = config
    self.embeddings = embeddings
    self.milvus_client = milvus_manager.get_client()

    # The enhanced chunker is used whenever its import succeeded at module load.
    self.use_enhanced_chunker = HAS_ENHANCED_CHUNKER
    if self.use_enhanced_chunker:
        self.enhanced_chunker = EnhancedASTChunker(
            max_chunk_size=config.chunk_size,
            context_mode="full"  # Full context for best retrieval
        )

    # The basic chunker is built in both modes (it is the fallback in enhanced
    # mode). It receives the configured sizes integer-divided by 100.
    self.ast_chunker = ASTChunker(
        max_chunk_size=config.chunk_size // 100,
        chunk_overlap=config.chunk_overlap // 100
    )

    if self.use_enhanced_chunker:
        logger.info("RepositoryIndexer initialized with ENHANCED AST chunking (+44% metadata richness)")
    else:
        logger.info("RepositoryIndexer initialized with basic AST chunking")

    # Markdown files get a dedicated chunker when it could be imported.
    self.use_markdown_chunker = HAS_MARKDOWN_CHUNKER
    if self.use_markdown_chunker:
        self.markdown_chunker = MarkdownChunker(
            max_chunk_size=config.chunk_size,
            extract_code_blocks=True,
            include_header_context=True
        )
        logger.info("MarkdownChunker enabled for README/documentation files")

    # Single fixed collection name used by every indexing method.
    self.collection_name = "code_chunks"

    # Running counters for the indexing pipeline.
    chunker_label = "enhanced" if self.use_enhanced_chunker else "basic"
    self.stats = {
        "files_discovered": 0,
        "files_indexed": 0,
        "chunks_created": 0,
        "chunks_by_type": {},  # chunk type -> count
        "errors": [],
        "chunker_type": chunker_label
    }

    # Most recently indexed chunks, kept for downstream summarization
    # (the indexing methods store only a truncated slice here).
    self.last_indexed_chunks: List[Dict[str, Any]] = []
128
+
129
def discover_files(self, repo_path: str) -> List[Path]:
    """
    Discover files to index in the repository.

    Each entry of ``config.file_extensions`` is globbed recursively below
    ``repo_path``. Matches are de-duplicated, restricted to regular files,
    and filtered against ``config.ignore_patterns`` (matched on the path
    relative to the repository root). The number of surviving files is
    stored in ``self.stats["files_discovered"]``.

    Args:
        repo_path: Path to the repository

    Returns:
        Sorted list of file paths to index

    Raises:
        ValueError: If repo_path doesn't exist or isn't a directory
    """
    repo = Path(repo_path)

    # Validate path - fail fast
    if not repo.exists():
        raise ValueError(f"Repository path does not exist: {repo_path}")
    if not repo.is_dir():
        raise ValueError(f"Repository path is not a directory: {repo_path}")

    logger.info(f"Discovering files in: {repo.absolute()}")
    logger.info(f"Looking for extensions: {self.config.file_extensions[:5]}...")

    # A set de-duplicates files matched by more than one extension pattern.
    candidates = set()
    for ext in self.config.file_extensions:
        # glob() also yields directories whose name ends with the extension
        # (e.g. a folder called "pkg.py"); those cannot be chunked, so only
        # regular files are kept.
        matches = [path for path in repo.glob(f"**/*{ext}") if path.is_file()]
        logger.debug(f"Found {len(matches)} files with extension {ext}")
        candidates.update(matches)

    # Apply ignore patterns
    filtered_files = []
    for file_path in candidates:
        relative_path = str(file_path.relative_to(repo))

        ignored_by = None
        for pattern in self.config.ignore_patterns:
            if self._matches_pattern(relative_path, pattern):
                ignored_by = pattern
                break

        if ignored_by is not None:
            logger.debug(f"Ignoring {relative_path} (matches {ignored_by})")
            continue

        filtered_files.append(file_path)

    self.stats["files_discovered"] = len(filtered_files)
    logger.info(f"Discovered {len(filtered_files)} files to index")

    return sorted(filtered_files)  # Sort for consistent ordering
185
+
186
def chunk_file(self, file_path: Path) -> List[Dict[str, Any]]:
    """
    Split one file into chunk dictionaries ready for embedding and storage.

    Routing:
    - ``.md`` files are handed to ``_chunk_markdown_file`` when the markdown
      chunker is enabled.
    - Otherwise the enhanced chunker is used when enabled, else the basic
      AST chunker.

    Every produced chunk increments its type counter in
    ``self.stats["chunks_by_type"]``. Any exception is logged, appended to
    ``self.stats["errors"]`` and converted into an empty result.

    Args:
        file_path: Path to the file to chunk

    Returns:
        List of chunks with metadata (empty when nothing was produced or on error)
    """
    records: List[Dict[str, Any]] = []

    try:
        # Markdown takes precedence over both code chunkers.
        if self.use_markdown_chunker and file_path.suffix.lower() == '.md':
            return self._chunk_markdown_file(file_path)

        if not self.use_enhanced_chunker:
            # Basic AST chunker path.
            basic_chunks = self.ast_chunker.chunk_file(file_path)
            if not basic_chunks:
                logger.debug(f"No chunks created from {file_path}")
                return []

            for piece in basic_chunks:
                records.append({
                    "id": self._generate_chunk_id(file_path, piece.start_line, piece.content),
                    "content": piece.content,
                    "contextualized_content": piece.content,  # Basic chunks carry no extra context
                    "file_path": str(file_path),
                    "file_name": file_path.name,
                    "start_line": piece.start_line,
                    "end_line": piece.end_line,
                    "language": piece.language,
                    "chunk_size": piece.size_chars,
                    "chunk_type": piece.chunk_type,
                    "chunk_name": piece.name or "",
                    "parent_context": piece.parent_context or "",
                    "signature": "",
                    "docstring": "",
                    "imports": [],
                    "return_type": "",
                    "indexed_at": datetime.now().isoformat()
                })

                kind = piece.chunk_type
                counters = self.stats["chunks_by_type"]
                counters[kind] = counters.get(kind, 0) + 1

            logger.debug(f"Created {len(records)} basic chunks from {file_path.name}")
            return records

        # Enhanced chunker path.
        rich_chunks = self.enhanced_chunker.chunk_file(file_path)
        if not rich_chunks:
            logger.debug(f"No chunks created from {file_path}")
            return []

        for piece in rich_chunks:
            # Scope chain flattened into a single " > "-separated string.
            scope_path = " > ".join(piece.scope) if piece.scope else ""
            signature_text = piece.signature or ""
            import_names = [imp.name for imp in piece.imports] if piece.imports else []

            records.append({
                "id": self._generate_chunk_id(file_path, piece.line_range[0], piece.text),
                "content": piece.text,
                "contextualized_content": piece.contextualized_text,  # Text used for embeddings
                "file_path": str(file_path),
                "file_name": file_path.name,
                "start_line": piece.line_range[0] + 1,  # Stored one higher than the chunker's value
                "end_line": piece.line_range[1] + 1,
                "language": piece.language,
                "chunk_size": piece.size_chars,
                "chunk_type": piece.chunk_type,
                "chunk_name": piece.name or "",
                "parent_context": scope_path,
                "signature": signature_text,
                "docstring": piece.docstring or "",
                "imports": import_names,
                "return_type": piece.return_type or "",
                "indexed_at": datetime.now().isoformat()
            })

            kind = piece.chunk_type
            counters = self.stats["chunks_by_type"]
            counters[kind] = counters.get(kind, 0) + 1

        logger.debug(f"Created {len(records)} enhanced chunks from {file_path.name}")

    except Exception as e:
        logger.error(f"Error chunking file {file_path}: {e}")
        self.stats["errors"].append({"file": str(file_path), "error": str(e)})
        return []

    return records
297
+
298
def _chunk_markdown_file(self, file_path: Path) -> List[Dict[str, Any]]:
    """
    Chunk a markdown file with ``self.markdown_chunker``.

    Each resulting record uses the same keys as the code-chunk records:
    - ``parent_context`` holds the chunk's scope joined with " > "
    - ``language`` is the chunk's code language, or "markdown" when unset
    - ``docstring`` holds the first 500 characters of 'section' chunks
    - ``imports`` is reused to carry the chunk's links as a JSON string

    Type counters in ``self.stats["chunks_by_type"]`` are kept under an
    ``md_`` prefix. Any exception is logged, appended to
    ``self.stats["errors"]`` and converted into an empty result.

    Args:
        file_path: Path to the markdown file

    Returns:
        List of chunks with metadata
    """
    records: List[Dict[str, Any]] = []

    try:
        sections = self.markdown_chunker.chunk_file(file_path)
        if not sections:
            logger.debug(f"No chunks created from markdown file {file_path}")
            return []

        for section in sections:
            # Scope flattened into one " > "-separated string.
            header_path = " > ".join(section.scope) if section.scope else ""

            # Links are serialized up front so the record holds a plain string.
            serialized_links = json.dumps(section.links) if section.links else "[]"

            records.append({
                "id": self._generate_chunk_id(file_path, section.line_range[0], section.text),
                "content": section.text,
                "contextualized_content": section.contextualized_text,
                "file_path": str(file_path),
                "file_name": file_path.name,
                "start_line": section.line_range[0] + 1,  # Stored one higher than the chunker's value
                "end_line": section.line_range[1] + 1,
                "language": section.code_language or "markdown",
                "chunk_size": section.size_chars,
                "chunk_type": section.chunk_type,
                "chunk_name": section.name or "",
                "parent_context": header_path,
                "signature": "",  # Not applicable for markdown
                "docstring": section.text[:500] if section.chunk_type == 'section' else "",
                "imports": serialized_links,  # Field reused for links
                "return_type": "",
                "indexed_at": datetime.now().isoformat()
            })

            # Markdown chunk types are tracked separately from code chunk types.
            stat_key = f"md_{section.chunk_type}"
            counters = self.stats["chunks_by_type"]
            counters[stat_key] = counters.get(stat_key, 0) + 1

        logger.debug(f"Created {len(records)} markdown chunks from {file_path.name}")

    except Exception as e:
        logger.error(f"Error chunking markdown file {file_path}: {e}")
        self.stats["errors"].append({"file": str(file_path), "error": str(e)})
        return []

    return records
364
+
365
def _create_chunk_dict(
    self,
    file_path: Path,
    content: str,
    start_line: int,
    end_line: int
) -> Dict[str, Any]:
    """
    Assemble the minimal metadata record for one chunk of ``file_path``.

    The record carries the generated chunk ID, the content and its length,
    the file location, the language derived from the file extension and the
    current timestamp in ISO format.
    """
    record: Dict[str, Any] = {}
    record["id"] = self._generate_chunk_id(file_path, start_line, content)
    record["content"] = content
    record["file_path"] = str(file_path)
    record["file_name"] = file_path.name
    record["start_line"] = start_line
    record["end_line"] = end_line
    record["language"] = self._detect_language(file_path)
    record["chunk_size"] = len(content)
    record["indexed_at"] = datetime.now().isoformat()
    return record
387
+
388
def _generate_chunk_id(self, file_path: Path, start_line: int, content: str) -> str:
    """
    Derive a 16-hex-character ID from file path, start line and content.

    The content contributes the first 8 hex characters of its MD5 digest;
    the combined "path:line:digest" string is then hashed with SHA-256 and
    truncated to 16 characters.
    """
    content_digest = hashlib.md5(content.encode()).hexdigest()[:8]
    seed = "{}:{}:{}".format(file_path, start_line, content_digest)
    full_hash = hashlib.sha256(seed.encode()).hexdigest()
    return full_hash[:16]
393
+
394
def _detect_language(self, file_path: Path) -> str:
    """
    Map a file's extension (case-insensitive) to a language label.

    Returns "text" for any extension that is not listed.
    """
    suffix = file_path.suffix.lower()

    # Language label -> extensions that map to it.
    suffixes_by_language = (
        ("python", (".py",)),
        ("javascript", (".js", ".jsx")),
        ("typescript", (".ts", ".tsx")),
        ("java", (".java",)),
        ("cpp", (".cpp", ".hpp")),
        ("c", (".c", ".h")),
        ("go", (".go",)),
        ("rust", (".rs",)),
        ("ruby", (".rb",)),
        ("php", (".php",)),
        ("csharp", (".cs",)),
        ("markdown", (".md",)),
        ("yaml", (".yaml", ".yml")),
        ("json", (".json",)),
        ("xml", (".xml",)),
        ("html", (".html",)),
        ("css", (".css",)),
        ("sql", (".sql",)),
        ("shell", (".sh",)),
        ("bash", (".bash",)),
    )

    for language, suffixes in suffixes_by_language:
        if suffix in suffixes:
            return language
    return "text"
424
+
425
def _matches_pattern(self, path: str, pattern: str) -> bool:
    """
    Tell whether ``path`` matches an ignore ``pattern``.

    Every "**" in the pattern is collapsed to "*" before matching with
    ``fnmatch``. The path is tried both as given and with a leading "/",
    so a pattern such as "**/node_modules/**" also matches a top-level
    "node_modules/..." path.
    """
    from fnmatch import fnmatch

    glob_pattern = pattern.replace("**", "*")
    if fnmatch(path, glob_pattern):
        return True
    return fnmatch("/" + path, glob_pattern)
435
+
436
def _ensure_collection_exists(self) -> None:
    """Create the Milvus collection only when it is missing; an existing one is left as is."""
    if self.milvus_client.has_collection(self.collection_name):
        return
    self.create_collection()
440
+
441
def create_collection(self) -> None:
    """
    Create the Milvus collection for storing chunks, replacing any existing one.

    An existing collection with the same name is dropped first, so all data
    previously stored in it is lost. The vector dimension is taken from the
    embeddings model. Errors raised by the Milvus client are not caught here.
    """
    client = self.milvus_client
    logger.info(f"Setting up collection: {self.collection_name}")

    # Start from a clean slate: remove whatever is stored under this name.
    if client.has_collection(self.collection_name):
        logger.info(f"Dropping existing collection: {self.collection_name}")
        client.drop_collection(self.collection_name)

    dimension = self.embeddings.get_dimension()

    creation_options = {
        "collection_name": self.collection_name,
        "dimension": dimension,
        "metric_type": "COSINE",  # Cosine similarity for semantic search
        "auto_id": True,  # Let Milvus generate IDs
    }
    client.create_collection(**creation_options)

    logger.info(f"✅ Collection created: {self.collection_name} (dimension={dimension})")
467
+
468
def index_repository(
    self,
    repo_path: str,
    show_progress: bool = True
) -> Dict[str, Any]:
    """
    Index an entire repository from the local filesystem.

    Pipeline: reset statistics, discover files, recreate the collection,
    chunk every file, embed the chunks' contextualized content in batches
    and insert the resulting rows into Milvus in batches.

    Args:
        repo_path: Path to the repository
        show_progress: Whether to show progress bars

    Returns:
        Dictionary with indexing statistics (``self.stats``)

    Raises:
        ValueError: If repo_path is invalid. This is raised by
            ``discover_files`` before the existing collection is dropped,
            so a bad path no longer destroys the current index.
    """
    start_time = datetime.now()
    logger.info(f"Starting repository indexing: {repo_path}")

    # Start each run from clean counters. Without this, counters accumulate
    # across runs, and chunk_file's error handler fails if a previous
    # index_from_adapter run replaced the stats dict with one lacking "errors".
    self.stats = {
        "files_discovered": 0,
        "files_indexed": 0,
        "chunks_created": 0,
        "chunks_by_type": {},
        "errors": [],
        "chunker_type": "enhanced" if self.use_enhanced_chunker else "basic"
    }

    # Validate the path and discover files BEFORE the destructive
    # drop/recreate of the collection.
    files = self.discover_files(repo_path)

    # Create/recreate collection
    self.create_collection()

    if not files:
        logger.warning("No files found to index")
        return self.stats

    # Process files
    all_chunks = []
    file_iterator = tqdm(files, desc="Processing files") if show_progress else files

    for file_path in file_iterator:
        chunks = self.chunk_file(file_path)
        if chunks:
            all_chunks.extend(chunks)
            self.stats["files_indexed"] += 1

    if not all_chunks:
        logger.warning("No chunks created from files")
        return self.stats

    # Cache a limited view of indexed chunks for downstream summarization/orchestration
    self.last_indexed_chunks = all_chunks[:200]

    self.stats["chunks_created"] = len(all_chunks)
    logger.info(f"Created {len(all_chunks)} chunks from {self.stats['files_indexed']} files")

    # Embed the contextualized text when a chunk has it, else the raw content.
    logger.info("Generating embeddings (using contextualized content)...")
    chunk_texts = [chunk.get("contextualized_content", chunk["content"]) for chunk in all_chunks]

    # Process in batches for memory efficiency
    batch_size = 100
    embeddings_list = []

    batch_iterator = range(0, len(chunk_texts), batch_size)
    if show_progress:
        batch_iterator = tqdm(batch_iterator, desc="Generating embeddings")

    for i in batch_iterator:
        batch = chunk_texts[i:i + batch_size]
        batch_embeddings = self.embeddings.embed_texts(batch, batch_size=32)
        embeddings_list.append(batch_embeddings)

    # Combine all embeddings
    all_embeddings = np.vstack(embeddings_list)
    logger.info(f"Generated embeddings: shape={all_embeddings.shape}")

    logger.info("Inserting into Milvus...")

    # One row per chunk; "id" is omitted because the collection auto-generates it.
    data = []
    for chunk, vector in zip(all_chunks, all_embeddings):
        # Markdown chunks already carry their links as a JSON string in
        # "imports"; only lists are serialized here to avoid encoding twice.
        raw_imports = chunk.get("imports", [])
        imports_json = raw_imports if isinstance(raw_imports, str) else json.dumps(raw_imports)

        data.append({
            "vector": vector.tolist(),
            "chunk_id": chunk["id"],  # Store our ID as metadata
            "content": chunk["content"],
            "file_path": chunk["file_path"],
            "file_name": chunk["file_name"],
            "start_line": chunk["start_line"],
            "end_line": chunk["end_line"],
            "language": chunk["language"],
            "chunk_size": chunk["chunk_size"],
            "chunk_type": chunk.get("chunk_type", "unknown"),
            "chunk_name": chunk.get("chunk_name", ""),
            "parent_context": chunk.get("parent_context", ""),
            "signature": chunk.get("signature", ""),
            "docstring": (chunk.get("docstring") or "")[:500],  # Truncate long docstrings
            "imports": imports_json,
            "return_type": chunk.get("return_type", "")
        })

    # Insert in batches
    insert_batch_size = 100
    for i in range(0, len(data), insert_batch_size):
        self.milvus_client.insert(
            collection_name=self.collection_name,
            data=data[i:i + insert_batch_size]
        )

    logger.info(f"✅ Inserted {len(data)} chunks into Milvus")

    # Calculate indexing time
    elapsed = (datetime.now() - start_time).total_seconds()
    self.stats["indexing_time_seconds"] = elapsed

    logger.info(f"Indexing complete in {elapsed:.2f} seconds")
    logger.info(f"Stats: Files: {self.stats['files_indexed']}/{self.stats['files_discovered']}, "
                f"Chunks: {self.stats['chunks_created']}")
    if self.stats["chunks_by_type"]:
        logger.info(f"Chunk types: {self.stats['chunks_by_type']}")

    return self.stats
594
+
595
def index_from_adapter(
    self,
    adapter: Any,  # RepositoryAdapter type
    show_progress: bool = True
) -> Dict[str, Any]:
    """
    Index a repository through an adapter (flexible source support).

    The adapter must provide ``get_info()`` returning a mapping with a
    "type" key, and ``get_files(extensions)`` yielding objects that expose
    ``path``, ``name`` and ``content``. Each file's content is written to a
    temporary file so the path-based chunkers can process it; the chunk
    metadata is then rewritten to point at the original path.

    Args:
        adapter: Repository adapter providing file access
        show_progress: Whether to show progress bars

    Returns:
        Dictionary with indexing statistics (``self.stats``)
    """
    start_time = time.time()
    adapter_info = adapter.get_info()
    logger.info(f"Starting indexing from {adapter_info['type']} adapter")

    # Create/recreate collection
    self.create_collection()

    # Reset stats. "errors" must be present because chunk_file appends to it
    # from its exception handler.
    self.stats = {
        "repository_type": adapter_info["type"],
        "repository_info": adapter_info,
        "files_discovered": 0,
        "files_indexed": 0,
        "chunks_created": 0,
        "chunks_by_type": {},
        "errors": [],
        "chunker_type": "enhanced" if self.use_enhanced_chunker else "basic",
        "indexing_time": 0.0
    }

    all_chunks = []
    files_seen = 0
    files_processed = 0

    # Materialize the file list only when a progress bar needs a total.
    files_iterator = adapter.get_files(self.config.file_extensions)
    files_list = list(files_iterator) if show_progress else files_iterator

    if show_progress:
        files_list = tqdm(files_list, desc="Processing files")

    for file in files_list:
        files_seen += 1
        tmp_path = None
        try:
            # The chunkers read from disk, so the content goes through a temp
            # file that keeps the original suffix (chunk_file routes on it).
            # TODO: Refactor AST chunker to accept content strings directly
            with tempfile.NamedTemporaryFile(
                mode='w',
                suffix=Path(file.path).suffix,
                delete=False,
                encoding='utf-8'
            ) as tmp:
                # Remember the path before writing so a failed write
                # (e.g. an encoding error) still gets cleaned up below.
                tmp_path = Path(tmp.name)
                tmp.write(file.content)

            # chunk_file already updates stats["chunks_by_type"]; counting
            # again here would report every chunk twice.
            chunks = self.chunk_file(tmp_path)

            for chunk in chunks:
                chunk["file_path"] = file.path
                chunk["file_name"] = file.name
                # The ID produced by chunk_file embeds the random temp file
                # name; rebuild it from the original path so IDs are stable
                # across runs.
                chunk["id"] = self._generate_chunk_id(
                    file.path, chunk["start_line"], chunk["content"]
                )

            all_chunks.extend(chunks)
            files_processed += 1

        except Exception as e:
            logger.warning(f"Failed to process {file.path}: {e}")
            self.stats["errors"].append({"file": str(file.path), "error": str(e)})

        finally:
            # Clean up temp file
            if tmp_path is not None:
                tmp_path.unlink(missing_ok=True)

    self.stats["files_discovered"] = files_seen
    self.stats["files_indexed"] = files_processed
    self.stats["chunks_created"] = len(all_chunks)

    logger.info(f"Created {len(all_chunks)} chunks from {files_processed} files")

    if not all_chunks:
        logger.warning("No chunks created from files")
        return self.stats

    self.last_indexed_chunks = all_chunks[:200]

    # Embed the contextualized text when a chunk has it, else the raw content.
    logger.info("Generating embeddings (using contextualized content)...")
    chunk_texts = [chunk.get("contextualized_content", chunk["content"]) for chunk in all_chunks]

    # Process in batches
    batch_size = 100
    embeddings_list = []

    for i in range(0, len(chunk_texts), batch_size):
        batch = chunk_texts[i:i + batch_size]
        batch_embeddings = self.embeddings.embed_texts(batch, batch_size=32)
        embeddings_list.append(batch_embeddings)

    # all_chunks is non-empty here, so there is at least one batch to stack.
    all_embeddings = np.vstack(embeddings_list)
    logger.info(f"Generated embeddings: shape={all_embeddings.shape}")

    # One row per chunk, with the same fields index_repository stores.
    data = []
    for chunk, vector in zip(all_chunks, all_embeddings):
        # Markdown chunks already carry their links as a JSON string in
        # "imports"; only lists are serialized here to avoid encoding twice.
        raw_imports = chunk.get("imports", [])
        imports_json = raw_imports if isinstance(raw_imports, str) else json.dumps(raw_imports)

        data.append({
            "vector": vector.tolist(),
            "chunk_id": chunk["id"],
            "content": chunk["content"],
            "file_path": chunk["file_path"],
            "file_name": chunk["file_name"],
            "start_line": chunk["start_line"],
            "end_line": chunk["end_line"],
            "language": chunk["language"],
            "chunk_size": chunk.get("chunk_size", len(chunk["content"])),
            "chunk_type": chunk.get("chunk_type", "unknown"),
            "chunk_name": chunk.get("chunk_name", ""),
            "parent_context": chunk.get("parent_context", ""),
            "signature": chunk.get("signature", ""),
            "docstring": (chunk.get("docstring") or "")[:500],
            "imports": imports_json,
            "return_type": chunk.get("return_type", "")
        })

    # Insert in batches, as index_repository does, instead of one large request.
    logger.info("Inserting into Milvus...")
    insert_batch_size = 100
    for i in range(0, len(data), insert_batch_size):
        self.milvus_client.insert(
            collection_name=self.collection_name,
            data=data[i:i + insert_batch_size]
        )
    logger.info(f"✅ Inserted {len(data)} chunks into Milvus")

    # Calculate final stats. Both key names are set: "indexing_time" is this
    # method's original key, "indexing_time_seconds" is what index_repository uses.
    elapsed = time.time() - start_time
    self.stats["indexing_time"] = elapsed
    self.stats["indexing_time_seconds"] = elapsed

    logger.info(f"Indexing complete in {self.stats['indexing_time']:.2f} seconds")
    logger.info(f"Stats: Files: {files_processed}, Chunks: {len(all_chunks)}")
    logger.info(f"Chunk types: {self.stats['chunks_by_type']}")

    return self.stats
749
+
750
def index_any_repository(
    self,
    source: str,
    show_progress: bool = True,
    **kwargs
) -> Dict[str, Any]:
    """
    Index a repository given only a source string.

    The adapter is built by ``create_repository_adapter(source, **kwargs)``
    and handed to ``index_from_adapter``. A ``GitCloneAdapter`` is used as a
    context manager for the duration of the indexing run.

    Args:
        source: Repository source (path, URL, or special format)
        show_progress: Whether to show progress bars
        **kwargs: Additional arguments for the adapter

    Examples:
        # Local repository
        indexer.index_any_repository("/path/to/repo")

        # Git repository
        indexer.index_any_repository("https://github.com/user/repo.git")

        # GitHub API
        indexer.index_any_repository("github:facebook/react", branch="main")

    Returns:
        Indexing statistics
    """
    from .repository_adapter import create_repository_adapter, GitCloneAdapter

    repo_adapter = create_repository_adapter(source, **kwargs)

    # Adapters other than GitCloneAdapter are used directly.
    if not isinstance(repo_adapter, GitCloneAdapter):
        return self.index_from_adapter(repo_adapter, show_progress)

    # Clone-based adapters are entered/exited around the indexing run.
    with repo_adapter:
        return self.index_from_adapter(repo_adapter, show_progress)
789
+
790
def upsert_rationale_entries(self, entries: Iterable[Dict[str, Any]]) -> None:
    """
    Insert or update rationale records so they can be retrieved alongside code chunks.

    Entries that are empty or whose "text" is blank are skipped. The
    remaining texts are embedded; for each one, any stored row with the same
    ``chunk_id`` is deleted (best effort) before the new rows are inserted,
    which gives upsert semantics on top of an auto-ID collection.

    Args:
        entries: Mappings with a "text" key and optional "record_id",
            "source_path", "source_name", "title", "start_line", "end_line",
            "language", "record_type" and "parent_context" keys. ``None`` is
            treated as an empty iterable.

    Returns:
        None. Embedding failures are logged and abort the upsert silently.
    """
    prepared: List[Dict[str, Any]] = []
    for entry in entries or []:
        if not entry:
            continue
        text = (entry.get("text") or "").strip()
        if not text:
            continue
        prepared.append({**entry, "text": text})

    if not prepared:
        return

    self._ensure_collection_exists()

    texts = [p["text"] for p in prepared]
    batch_size = min(32, len(texts)) or 1
    try:
        vectors = self.embeddings.embed_texts(texts, batch_size=batch_size)
    except Exception as exc:  # pragma: no cover - defensive
        logger.error("Failed to embed rationale entries: %s", exc)
        return

    payloads: List[Dict[str, Any]] = []
    for entry, vector in zip(prepared, vectors):
        record_id = entry.get("record_id") or self._generate_rationale_chunk_id(entry)

        # record_id may come from the caller; escape backslashes and double
        # quotes so it cannot terminate the string literal in the filter
        # expression and alter which rows get deleted.
        escaped_id = str(record_id).replace("\\", "\\\\").replace('"', '\\"')

        # Remove previous copy if present (best effort: a failed delete must
        # not prevent the new record from being stored).
        try:
            self.milvus_client.delete(
                collection_name=self.collection_name,
                filter=f'chunk_id == "{escaped_id}"'
            )
        except Exception as exc:
            logger.debug("Could not delete existing rationale chunk %s: %s", record_id, exc)

        payloads.append({
            # Accept numpy rows as well as plain sequences from the embedder.
            "vector": vector.tolist() if hasattr(vector, "tolist") else list(vector),
            "chunk_id": record_id,
            "content": entry["text"],
            "file_path": entry.get("source_path", ""),
            "file_name": entry.get("source_name", entry.get("title", "")),
            "start_line": entry.get("start_line", 0),
            "end_line": entry.get("end_line", 0),
            "language": entry.get("language", "text"),
            "chunk_size": len(entry["text"]),
            "chunk_type": entry.get("record_type", "rationale"),
            "chunk_name": entry.get("title", ""),
            "parent_context": entry.get("parent_context", ""),
        })

    if not payloads:
        return

    self.milvus_client.insert(
        collection_name=self.collection_name,
        data=payloads
    )
    # Flushing is optional: failures (including a client without flush) are
    # only logged at debug level.
    try:
        self.milvus_client.flush(self.collection_name)
    except Exception:
        logger.debug("Milvus flush not available for rationale entries")
853
+
854
+ @staticmethod
855
+ def _generate_rationale_chunk_id(entry: Dict[str, Any]) -> str:
856
+ base = entry.get("record_id") or entry.get("text") or "rationale"
857
+ return hashlib.sha256(base.encode("utf-8")).hexdigest()[:32]
858
+
859
+
860
+ # Example usage
861
if __name__ == "__main__":
    # Manual smoke test: index this package's own sources and print the stats.
    # The relative imports below mean the module has to be run as part of its package.
    print("Testing Repository Indexer")
    print("-" * 50)

    # Setup
    from .config import ClaudeContextConfig, MilvusManager
    from .embeddings import LocalEmbeddings

    config = ClaudeContextConfig()
    embeddings = LocalEmbeddings()
    milvus_manager = MilvusManager(config)

    with milvus_manager:
        indexer = RepositoryIndexer(config, embeddings, milvus_manager)

        # Test with the claude_context module itself
        test_repo = "./src/claude_context"
        print(f"\nIndexing: {test_repo}")
        stats = indexer.index_repository(test_repo, show_progress=True)

        print("\n📊 Indexing Statistics:")
        print(f" Files discovered: {stats['files_discovered']}")
        print(f" Files indexed: {stats['files_indexed']}")
        print(f" Chunks created: {stats['chunks_created']}")
        print(f" Time taken: {stats.get('indexing_time_seconds', 0):.2f}s")

        # Show at most the first three recorded errors.
        recorded_errors = stats["errors"]
        if recorded_errors:
            print(f" Errors: {len(recorded_errors)}")
            for error in recorded_errors[:3]:
                print(f" - {error}")

        print("\n✅ Indexer test complete!")