code-finder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. claude_context/__init__.py +33 -0
  2. claude_context/agentic_integration.py +309 -0
  3. claude_context/ast_chunker.py +646 -0
  4. claude_context/config.py +239 -0
  5. claude_context/context_manager.py +627 -0
  6. claude_context/embeddings.py +307 -0
  7. claude_context/embeddings_interface.py +226 -0
  8. claude_context/enhanced_ast_chunker.py +1129 -0
  9. claude_context/explorer.py +951 -0
  10. claude_context/explorer_with_context.py +1008 -0
  11. claude_context/indexer.py +893 -0
  12. claude_context/markdown_chunker.py +421 -0
  13. claude_context/mode_handler.py +1774 -0
  14. claude_context/query_metrics.py +164 -0
  15. claude_context/question_generator.py +800 -0
  16. claude_context/readme_extractor.py +485 -0
  17. claude_context/repository_adapter.py +399 -0
  18. claude_context/search.py +493 -0
  19. claude_context/skills/__init__.py +11 -0
  20. claude_context/skills/_cli_common.py +74 -0
  21. claude_context/skills/_index_manager.py +98 -0
  22. claude_context/skills/api_surface.py +219 -0
  23. claude_context/skills/evidence_retrieval.py +151 -0
  24. claude_context/skills/grounded_review.py +212 -0
  25. claude_context/synthesis/__init__.py +8 -0
  26. claude_context/synthesis/editor_agent.py +391 -0
  27. claude_context/synthesis/llm_synthesizer.py +153 -0
  28. claude_context/synthesis/logic_explainer.py +235 -0
  29. claude_context/synthesis/multi_review_pipeline.py +717 -0
  30. claude_context/synthesis/prompt_builder.py +439 -0
  31. claude_context/synthesis/providers.py +115 -0
  32. claude_context/synthesis/validators.py +458 -0
  33. code_finder-0.1.0.dist-info/METADATA +823 -0
  34. code_finder-0.1.0.dist-info/RECORD +37 -0
  35. code_finder-0.1.0.dist-info/WHEEL +5 -0
  36. code_finder-0.1.0.dist-info/entry_points.txt +4 -0
  37. code_finder-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,493 @@
1
+ """
2
+ Hybrid Search for Claude Context
3
+
4
+ Combines BM25 (keyword) and vector (semantic) search for optimal code retrieval.
5
+ This gives us the best of both worlds:
6
+ - BM25: Exact matches, variable names, specific terms
7
+ - Vector: Conceptual similarity, related functionality
8
+ """
9
+
10
+ import logging
11
+ import time
12
+ from typing import List, Dict, Any, Optional, Tuple
13
+ from dataclasses import dataclass, field
14
+ from pathlib import Path
15
+ import re
16
+
17
+ import numpy as np
18
+ from rank_bm25 import BM25Okapi
19
+
20
+ from .config import ClaudeContextConfig, MilvusManager
21
+ from .embeddings import LocalEmbeddings
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
@dataclass
class SearchResult:
    """A single retrieved code chunk together with its relevance scores."""

    # Chunk text and location
    content: str
    file_path: str
    file_name: str
    start_line: int
    end_line: int
    language: str
    chunk_type: str  # function, class, method, import_block, etc.
    chunk_name: Optional[str] = None
    parent_context: Optional[str] = None  # Full scope chain (e.g., "ClassName > method_name")

    # Enhanced metadata (from EnhancedASTChunker)
    signature: Optional[str] = None  # Full function signature with params and types
    docstring: Optional[str] = None  # Extracted docstring
    imports: Optional[str] = None  # JSON list of imports used in this chunk
    return_type: Optional[str] = None  # Return type annotation

    # Scoring
    vector_score: float = 0.0  # Cosine similarity (0-1, higher is better)
    bm25_score: float = 0.0  # BM25 score (unbounded, higher is better)
    combined_score: float = 0.0  # Weighted combination

    # Metadata
    chunk_id: str = ""
    milvus_id: Optional[int] = None

    def format_result(self, include_scores: bool = False, include_metadata: bool = False) -> str:
        """Render this result as a short, human-readable text block.

        Args:
            include_scores: Append a line with the vector, BM25 and combined scores.
            include_metadata: Append signature, return type and the first
                docstring line (truncated to 80 characters) when present.

        Returns:
            The header line(s) followed by a preview of at most the first
            three content lines; an ellipsis line is added when the content
            is longer than that.
        """
        # Header pieces are collected in order and joined once at the end.
        pieces = [f"📄 {self.file_name}"]
        if self.parent_context:
            pieces.append(f" > {self.parent_context}")
        if self.chunk_name:
            pieces.append(f" > {self.chunk_name}")
        if self.chunk_type:
            pieces.append(f" ({self.chunk_type})")
        pieces.append(f" [Lines {self.start_line}-{self.end_line}]")

        if include_scores:
            pieces.append(
                f"\n Scores: Vector={self.vector_score:.3f}, BM25={self.bm25_score:.3f}, Combined={self.combined_score:.3f}"
            )

        if include_metadata:
            if self.signature:
                pieces.append(f"\n Signature: {self.signature}")
            if self.return_type:
                pieces.append(f"\n Returns: {self.return_type}")
            if self.docstring:
                # Only the first docstring line is shown, capped at 80 chars.
                summary = self.docstring.split('\n')[0][:80]
                pieces.append(f"\n Doc: {summary}")

        # Content preview: first three lines, with an ellipsis if truncated.
        content_lines = self.content.split('\n')
        preview = '\n'.join(f" {line}" for line in content_lines[:3])
        if len(content_lines) > 3:
            preview += "\n ..."

        return f"{''.join(pieces)}\n{preview}"
88
+
89
+
90
class HybridSearcher:
    """
    Hybrid search combining BM25 and vector search.

    This class provides:
    1. BM25 for keyword/exact matching
    2. Vector search for semantic similarity
    3. Hybrid ranking combining both scores
    """

    # Fields requested from Milvus for every stored row / search hit.
    _OUTPUT_FIELDS = (
        "content", "file_path", "file_name", "start_line", "end_line",
        "language", "chunk_type", "chunk_name", "parent_context", "chunk_id",
        "signature", "docstring", "imports", "return_type",  # Enhanced fields
    )

    # Maximum number of rows loaded into the in-memory BM25 index.
    _BM25_FETCH_LIMIT = 10000

    # Word-like runs (letters, digits, underscores) in raw source text.
    _WORD_RE = re.compile(r'\b\w+\b')

    # Pieces of one underscore-free identifier fragment: an acronym followed
    # by a capitalised word ("HTTP" in "HTTPServer"), an optionally
    # capitalised lowercase word, a trailing acronym, or a digit run.
    _CAMEL_PART_RE = re.compile(r'[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|\d+')

    def __init__(
        self,
        config: ClaudeContextConfig,
        embeddings: LocalEmbeddings,
        milvus_manager: MilvusManager,
        bm25_weight: float = 0.5,
        vector_weight: float = 0.5
    ):
        """
        Initialize the hybrid searcher.

        Args:
            config: Configuration
            embeddings: Embeddings model
            milvus_manager: Milvus connection manager
            bm25_weight: Weight for BM25 scores (0-1)
            vector_weight: Weight for vector scores (0-1)

        Raises:
            ValueError: If a required collaborator is missing, or if the two
                weights sum to zero (they could not be normalized).
        """
        if not config:
            raise ValueError("config is required")
        if not embeddings:
            raise ValueError("embeddings is required")
        if not milvus_manager:
            raise ValueError("milvus_manager is required")

        # Normalize weights so that they sum to 1.
        total_weight = bm25_weight + vector_weight
        if total_weight == 0:
            raise ValueError("bm25_weight and vector_weight must not sum to zero")
        self.bm25_weight = bm25_weight / total_weight
        self.vector_weight = vector_weight / total_weight

        self.config = config
        self.embeddings = embeddings
        self.milvus_client = milvus_manager.get_client()
        # NOTE(review): other code in this package reads the collection name
        # from config.collection_name; confirm both always agree.
        self.collection_name = "code_chunks"

        # BM25 components (will be built from indexed data)
        self.bm25_index: Optional[BM25Okapi] = None
        self.bm25_documents: List[List[str]] = []  # one token list per document
        self.bm25_metadata: List[Dict[str, Any]] = []

        logger.info(f"HybridSearcher initialized (BM25={self.bm25_weight:.2f}, Vector={self.vector_weight:.2f})")

    def build_bm25_index(self) -> None:
        """
        Build BM25 index from Milvus data.

        Loads up to ``_BM25_FETCH_LIMIT`` rows (without vectors) from the
        collection, tokenizes their content and builds an in-memory BM25
        index plus a parallel metadata list used to materialize results.

        If the collection holds no rows, any previously built index is
        discarded so that searches do not return rows that no longer exist.

        Raises:
            ValueError: If the collection does not exist.
        """
        logger.info("Building BM25 index from Milvus data...")

        # Check if collection exists
        if not self.milvus_client.has_collection(self.collection_name):
            raise ValueError(f"Collection '{self.collection_name}' does not exist. Index a repository first.")

        # Query all documents (without vectors to save memory)
        results = self.milvus_client.query(
            collection_name=self.collection_name,
            filter="",  # No filter - get everything
            output_fields=list(self._OUTPUT_FIELDS),
            limit=self._BM25_FETCH_LIMIT
        )

        if not results:
            logger.warning("No documents found in Milvus")
            # Drop stale state from an earlier build.
            self.bm25_index = None
            self.bm25_documents = []
            self.bm25_metadata = []
            return

        if len(results) >= self._BM25_FETCH_LIMIT:
            # The query is capped; make the truncation visible instead of silent.
            logger.warning(
                f"BM25 index truncated to {self._BM25_FETCH_LIMIT} documents; "
                "keyword search will not cover the whole collection"
            )

        # Build into locals first so a failure cannot leave half-updated state.
        documents: List[List[str]] = []
        metadata: List[Dict[str, Any]] = []
        for doc in results:
            content = doc.get("content", "")
            documents.append(self._tokenize_code(content))

            # Store metadata for retrieval (including enhanced fields)
            metadata.append({
                "id": doc.get("id"),
                "content": content,
                "file_path": doc.get("file_path", ""),
                "file_name": doc.get("file_name", ""),
                "start_line": doc.get("start_line", 0),
                "end_line": doc.get("end_line", 0),
                "language": doc.get("language", ""),
                "chunk_type": doc.get("chunk_type", ""),
                "chunk_name": doc.get("chunk_name", ""),
                "parent_context": doc.get("parent_context", ""),
                "chunk_id": doc.get("chunk_id", ""),
                # Enhanced fields
                "signature": doc.get("signature", ""),
                "docstring": doc.get("docstring", ""),
                "imports": doc.get("imports", ""),
                "return_type": doc.get("return_type", "")
            })

        self.bm25_documents = documents
        self.bm25_metadata = metadata
        self.bm25_index = BM25Okapi(self.bm25_documents)

        logger.info(f"✅ BM25 index built with {len(self.bm25_documents)} documents")

    def _tokenize_code(self, text: str) -> List[str]:
        """
        Tokenize code for BM25 indexing.

        This handles:
        - CamelCase splitting (className -> class, name)
        - Acronym boundaries (HTTPServer -> http, server)
        - snake_case splitting (get_user -> get, user)
        - Digit runs as separate tokens (v2 -> v, 2)
        - Special characters removal
        - Lowercasing for consistency

        Casing is inspected *before* lowercasing; lowercasing first would
        erase the camelCase boundaries this method is meant to split on.

        Args:
            text: Raw source text or a search query.

        Returns:
            Lowercased tokens in order of appearance.
        """
        expanded_tokens: List[str] = []
        for token in self._WORD_RE.findall(text):
            for part in token.split('_'):
                if not part:
                    continue
                # The camelCase pattern is ASCII-only; keep non-ASCII
                # fragments whole rather than dropping characters from them.
                pieces = self._CAMEL_PART_RE.findall(part) if part.isascii() else []
                if pieces:
                    expanded_tokens.extend(piece.lower() for piece in pieces)
                else:
                    expanded_tokens.append(part.lower())
        return expanded_tokens

    @staticmethod
    def _is_under_path(file_path: str, base: Path) -> bool:
        """Return True when *file_path* equals *base* or lies beneath it.

        The comparison works on whole path components, so ``/repo/src``
        does not match ``/repo/src2/x.py`` as a plain string prefix would.
        """
        candidate = Path(file_path)
        return candidate == base or base in candidate.parents

    def search(
        self,
        query: str,
        limit: int = 10,
        filter_chunk_types: Optional[List[str]] = None,
        filter_languages: Optional[List[str]] = None,
        filter_paths: Optional[List[str]] = None,
        rerank: bool = True
    ) -> List[SearchResult]:
        """
        Perform hybrid search combining BM25 and vector search.

        Args:
            query: Search query
            limit: Maximum number of results
            filter_chunk_types: Filter by chunk types (function, class, etc.)
            filter_languages: Filter by programming languages
            filter_paths: Filter results to files under these directory prefixes
            rerank: Whether to use hybrid reranking

        Returns:
            List of SearchResult objects sorted by relevance. Empty for a
            blank query or a non-positive limit.
        """
        if not query.strip():
            logger.warning("Empty search query")
            return []
        if limit <= 0:
            return []

        start_time = time.time()

        # Ensure BM25 index is built
        if self.bm25_index is None:
            self.build_bm25_index()
            if self.bm25_index is None:
                logger.warning("Could not build BM25 index, falling back to vector-only search")
                rerank = False

        # Over-fetch when post-filters are active so we have enough after filtering
        has_filters = filter_paths or filter_chunk_types or filter_languages
        fetch_limit = limit * 5 if has_filters else limit * 2

        # 1. Vector Search
        vector_results = self._vector_search(query, fetch_limit)

        # 2. BM25 Search (if available)
        bm25_results = []
        if self.bm25_index and rerank:
            bm25_results = self._bm25_search(query, fetch_limit)

        # 3. Combine and rerank
        if rerank and bm25_results:
            results = self._hybrid_rerank(vector_results, bm25_results, fetch_limit)
        else:
            results = vector_results[:fetch_limit]

        # 4. Apply filters
        if filter_paths:
            # Paths should be absolute; resolve any relative ones against cwd as fallback
            bases = [Path(p) if Path(p).is_absolute() else Path(p).resolve()
                     for p in filter_paths]
            results = [r for r in results
                       if any(self._is_under_path(r.file_path, base) for base in bases)]
        if filter_chunk_types:
            results = [r for r in results if r.chunk_type in filter_chunk_types]
        if filter_languages:
            results = [r for r in results if r.language in filter_languages]

        elapsed = time.time() - start_time
        logger.info(f"Search completed in {elapsed:.3f}s - Found {len(results)} results")

        return results[:limit]

    def _vector_search(self, query: str, limit: int) -> List[SearchResult]:
        """Embed *query* and return up to *limit* nearest chunks from Milvus."""
        # Generate query embedding
        query_embedding = self.embeddings.embed_texts([query])[0]

        # Search in Milvus (including enhanced fields)
        search_results = self.milvus_client.search(
            collection_name=self.collection_name,
            data=[query_embedding.tolist()],
            limit=limit,
            output_fields=list(self._OUTPUT_FIELDS)
        )

        results = []
        for hits in search_results:
            for hit in hits:
                entity = hit["entity"]
                results.append(SearchResult(
                    content=entity.get("content", ""),
                    file_path=entity.get("file_path", ""),
                    file_name=entity.get("file_name", ""),
                    start_line=entity.get("start_line", 0),
                    end_line=entity.get("end_line", 0),
                    language=entity.get("language", ""),
                    chunk_type=entity.get("chunk_type", ""),
                    chunk_name=entity.get("chunk_name"),
                    parent_context=entity.get("parent_context"),
                    # Enhanced fields
                    signature=entity.get("signature"),
                    docstring=entity.get("docstring"),
                    imports=entity.get("imports"),
                    return_type=entity.get("return_type"),
                    # Scoring
                    # NOTE(review): this treats hit["distance"] as a distance
                    # (smaller is closer). Confirm against the collection's
                    # metric type before relying on the absolute value.
                    vector_score=1 - hit["distance"],
                    chunk_id=entity.get("chunk_id", ""),
                    milvus_id=hit.get("id")
                ))

        return results

    def _bm25_search(self, query: str, limit: int) -> List[SearchResult]:
        """Return up to *limit* chunks with a positive BM25 score for *query*."""
        if not self.bm25_index:
            return []

        query_tokens = self._tokenize_code(query)
        if not query_tokens:
            # Nothing to match on (e.g. a punctuation-only query).
            return []

        scores = self.bm25_index.get_scores(query_tokens)

        # Indices of the best-scoring documents, highest score first.
        top_indices = np.argsort(scores)[::-1][:limit]

        results = []
        for idx in top_indices:
            if scores[idx] <= 0:
                # Scores are in descending order, so nothing useful follows.
                break
            metadata = self.bm25_metadata[idx]
            results.append(SearchResult(
                content=metadata["content"],
                file_path=metadata["file_path"],
                file_name=metadata["file_name"],
                start_line=metadata["start_line"],
                end_line=metadata["end_line"],
                language=metadata["language"],
                chunk_type=metadata["chunk_type"],
                chunk_name=metadata["chunk_name"],
                parent_context=metadata["parent_context"],
                # Enhanced fields
                signature=metadata.get("signature"),
                docstring=metadata.get("docstring"),
                imports=metadata.get("imports"),
                return_type=metadata.get("return_type"),
                # Scoring
                bm25_score=float(scores[idx]),
                chunk_id=metadata["chunk_id"],
                # Keep the stored row id so keyword-only hits stay traceable.
                milvus_id=metadata.get("id")
            ))

        return results

    def _hybrid_rerank(
        self,
        vector_results: List[SearchResult],
        bm25_results: List[SearchResult],
        limit: int
    ) -> List[SearchResult]:
        """
        Combine and rerank results from vector and BM25 search.

        Results are merged by ``file_path:start_line``. Each score is divided
        by the maximum of its kind among the merged results, and the combined
        score is the weighted sum of the two normalized scores.

        The SearchResult objects passed in are updated in place.

        Returns:
            Up to *limit* merged results, highest combined score first.
        """
        # Create a map of all unique results
        all_results: Dict[str, SearchResult] = {}

        # Add vector results
        for result in vector_results:
            key = f"{result.file_path}:{result.start_line}"
            existing = all_results.get(key)
            if existing is None:
                all_results[key] = result
            else:
                # Same chunk seen twice: keep the better vector score.
                existing.vector_score = max(existing.vector_score, result.vector_score)

        # Add/update BM25 results
        for result in bm25_results:
            key = f"{result.file_path}:{result.start_line}"
            existing = all_results.get(key)
            if existing is None:
                # Chunk found by keyword search only.
                all_results[key] = result
            else:
                existing.bm25_score = result.bm25_score

        # Get max scores for normalization
        max_vector = max((r.vector_score for r in all_results.values()), default=1.0)
        max_bm25 = max((r.bm25_score for r in all_results.values()), default=1.0)

        for result in all_results.values():
            # Scale by the maximum; a non-positive maximum contributes nothing.
            norm_vector = result.vector_score / max_vector if max_vector > 0 else 0
            norm_bm25 = result.bm25_score / max_bm25 if max_bm25 > 0 else 0

            # Calculate weighted combined score
            result.combined_score = (
                self.vector_weight * norm_vector +
                self.bm25_weight * norm_bm25
            )

        # Sort by combined score
        sorted_results = sorted(
            all_results.values(),
            key=lambda x: x.combined_score,
            reverse=True
        )

        return sorted_results[:limit]

    def explain_search(self, query: str, limit: int = 5) -> str:
        """
        Perform search and explain the scoring.

        Useful for debugging and understanding why certain results rank higher.

        Returns:
            A text report with the query, the normalized weights and each
            result formatted with its scores.
        """
        results = self.search(query, limit=limit)

        sections = [
            f"🔍 Search Query: '{query}'\n",
            f"⚖️ Weights: BM25={self.bm25_weight:.2f}, Vector={self.vector_weight:.2f}\n\n",
        ]
        sections.extend(
            f"{i}. {result.format_result(include_scores=True)}\n\n"
            for i, result in enumerate(results, 1)
        )
        return "".join(sections)
465
+
466
+
467
+ # Convenience function
468
def create_hybrid_searcher(
    config: ClaudeContextConfig,
    embeddings: LocalEmbeddings,
    milvus_manager: MilvusManager,
    bm25_weight: float = 0.5
) -> HybridSearcher:
    """
    Build a HybridSearcher whose two weights are complementary.

    Args:
        config: Configuration
        embeddings: Embeddings model
        milvus_manager: Milvus manager
        bm25_weight: Weight for BM25 (0-1); the vector weight is set to
            1 - bm25_weight

    Returns:
        Configured HybridSearcher
    """
    # The vector side receives whatever share BM25 does not take.
    return HybridSearcher(
        config=config,
        embeddings=embeddings,
        milvus_manager=milvus_manager,
        bm25_weight=bm25_weight,
        vector_weight=1.0 - bm25_weight,
    )
@@ -0,0 +1,11 @@
1
+ """
2
+ Bridge skills for vibe2doc.
3
+
4
+ Standalone CLI-callable modules that expose vibe2doc's code analysis
5
+ capabilities for use in any documentation workflow.
6
+
7
+ Skills:
8
+ evidence_retrieval: Search codebase for evidence matching a query
9
+ grounded_review: Verify document claims against source code
10
+ api_surface: Extract API surface from source files (deterministic)
11
+ """
@@ -0,0 +1,74 @@
1
+ """
2
+ Shared CLI utilities for bridge skills.
3
+
4
+ Handles argument parsing, logging to stderr, and JSON output routing
5
+ so that stdout stays clean for machine-consumable JSON.
6
+ """
7
+
8
+ import argparse
9
+ import json
10
+ import logging
11
+ import sys
12
+ from pathlib import Path
13
+ from typing import Any, Dict, Optional
14
+
15
+
16
def setup_logging():
    """Route all logging to stderr at INFO level, leaving stdout free for JSON.

    Any handlers already attached to the root logger are removed first, so
    the stderr handler is the only one left afterwards.
    """
    stderr_handler = logging.StreamHandler(sys.stderr)
    stderr_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    )

    root_logger = logging.getLogger()
    root_logger.handlers.clear()
    root_logger.addHandler(stderr_handler)
    root_logger.setLevel(logging.INFO)
26
+
27
+
28
def add_common_args(parser: argparse.ArgumentParser):
    """Register the options shared by every bridge-skill CLI on *parser*.

    Adds three optional string options (``--ticket``, ``--base-path``,
    ``--db-path``), each defaulting to ``None``, and the boolean flag
    ``--reindex``.
    """
    # (flag, help text) for the plain string options, in registration order.
    string_options = (
        ('--ticket',
         'Docs-orchestrator ticket ID (stored in output metadata)'),
        ('--base-path',
         'Docs-orchestrator output base path. When set, writes JSON to file instead of stdout.'),
        ('--db-path',
         'Override Milvus DB path (default: {repo}/.vibe2doc/index.db)'),
    )
    for flag, help_text in string_options:
        parser.add_argument(flag, type=str, default=None, help=help_text)

    parser.add_argument(
        '--reindex', action='store_true',
        help='Force re-indexing even if an existing index is found'
    )
46
+
47
+
48
def output_result(result: Dict[str, Any], args: argparse.Namespace, filename: str = "output.json"):
    """Output JSON result to stdout or file depending on mode.

    The caller's *result* dict is not modified: the ticket ID, when given,
    is added to a shallow copy that is then serialized.

    Args:
        result: The result dict to serialize
        args: Parsed CLI args (checks for --base-path and --ticket; either
            attribute may be absent)
        filename: Output filename when writing to --base-path
    """
    # Work on a copy so the ticket does not leak into the caller's data.
    payload = dict(result)
    ticket = getattr(args, "ticket", None)
    if ticket:
        payload["ticket"] = ticket

    # default=str stringifies values json cannot encode natively.
    json_str = json.dumps(payload, indent=2, default=str)

    base_path = getattr(args, "base_path", None)
    if base_path:
        out_dir = Path(base_path)
        out_dir.mkdir(parents=True, exist_ok=True)
        out_path = out_dir / filename
        # Explicit encoding keeps the file independent of the platform default.
        out_path.write_text(json_str, encoding="utf-8")
        logging.getLogger(__name__).info(f"Output written to {out_path}")
    else:
        print(json_str)
69
+
70
+
71
def error_result(error: str, exit_code: int = 1):
    """Emit a one-line JSON error object on stdout, then terminate the process.

    Args:
        error: Human-readable error message.
        exit_code: Process exit status; also included in the JSON object.
    """
    payload = {"error": error, "exit_code": exit_code}
    print(json.dumps(payload))
    sys.exit(exit_code)
@@ -0,0 +1,98 @@
1
+ """
2
+ Index lifecycle manager for bridge skills.
3
+
4
+ Handles the expensive indexing operation with reuse: if an existing
5
+ Milvus DB is found for a repo, reuse it. Otherwise, index from scratch.
6
+ """
7
+
8
+ import logging
9
+ from pathlib import Path
10
+ from typing import Dict, Any, Optional, Tuple
11
+
12
+ from ..config import ClaudeContextConfig, MilvusManager
13
+ from ..embeddings import LocalEmbeddings
14
+ from ..indexer import RepositoryIndexer
15
+ from ..search import HybridSearcher, create_hybrid_searcher
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def _default_db_path(repo_path: str) -> str:
    """Return ``<resolved repo>/.vibe2doc/index.db`` as a string."""
    repo_root = Path(repo_path).resolve()
    return str(repo_root.joinpath(".vibe2doc", "index.db"))
23
+
24
+
25
def ensure_index(
    repo_path: str,
    db_path: Optional[str] = None,
    reindex: bool = False,
) -> Tuple[HybridSearcher, Dict[str, Any]]:
    """Return a ready-to-use searcher for a repository, indexing it if needed.

    An existing DB file is reused unless ``reindex`` is set. When the file
    is missing, re-indexing is forced, or the file exists but lacks the
    configured collection, the repository is indexed from scratch.

    Args:
        repo_path: Path to the repository to index
        db_path: Override Milvus DB path (default: {repo}/.vibe2doc/index.db)
        reindex: Force re-indexing even if index exists

    Returns:
        (searcher, info) where info holds ``repo_path``, ``index_path``,
        ``reused_existing`` and ``chunks_count``
    """
    repo_path = str(Path(repo_path).resolve())
    db_path = _default_db_path(repo_path) if db_path is None else db_path

    # Create the parent directory, then record whether the DB file itself
    # was already present before any component gets a chance to create it.
    db_file = Path(db_path)
    db_file.parent.mkdir(parents=True, exist_ok=True)
    db_exists = db_file.exists()

    config = ClaudeContextConfig(db_path=db_path)
    milvus_mgr = MilvusManager(config)
    embeddings = LocalEmbeddings(config.embedding_model)

    info = {
        "repo_path": repo_path,
        "index_path": db_path,
        "reused_existing": False,
        "chunks_count": 0,
    }

    def _index_from_scratch() -> int:
        """Run a full indexing pass and return the number of chunks indexed."""
        indexer = RepositoryIndexer(config, embeddings, milvus_mgr)
        stats = indexer.index_repository(repo_path, show_progress=True)
        return stats.get("chunks_indexed", 0)

    if reindex or not db_exists:
        # Fresh index
        if reindex:
            logger.info(f"Re-indexing {repo_path} (forced)")
        else:
            logger.info(f"No existing index found, indexing {repo_path}")
        info["chunks_count"] = _index_from_scratch()
    else:
        # Reuse existing index
        logger.info(f"Reusing existing index at {db_path}")
        client = milvus_mgr.get_client()
        if client.has_collection(config.collection_name):
            rows = client.query(
                collection_name=config.collection_name,
                filter="",
                output_fields=["count(*)"],
            )
            info["chunks_count"] = rows[0].get("count(*)", 0) if rows else 0
            info["reused_existing"] = True
        else:
            # DB file exists but collection doesn't — need to index
            logger.info("DB exists but collection missing, indexing from scratch")
            info["chunks_count"] = _index_from_scratch()

    # Create searcher and build BM25 index
    searcher = create_hybrid_searcher(config, embeddings, milvus_mgr)
    searcher.build_bm25_index()

    logger.info(
        f"Index ready: {info['chunks_count']} chunks, "
        f"reused={info['reused_existing']}"
    )
    return searcher, info