code-finder 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_context/__init__.py +33 -0
- claude_context/agentic_integration.py +309 -0
- claude_context/ast_chunker.py +646 -0
- claude_context/config.py +239 -0
- claude_context/context_manager.py +627 -0
- claude_context/embeddings.py +307 -0
- claude_context/embeddings_interface.py +226 -0
- claude_context/enhanced_ast_chunker.py +1129 -0
- claude_context/explorer.py +951 -0
- claude_context/explorer_with_context.py +1008 -0
- claude_context/indexer.py +893 -0
- claude_context/markdown_chunker.py +421 -0
- claude_context/mode_handler.py +1774 -0
- claude_context/query_metrics.py +164 -0
- claude_context/question_generator.py +800 -0
- claude_context/readme_extractor.py +485 -0
- claude_context/repository_adapter.py +399 -0
- claude_context/search.py +493 -0
- claude_context/skills/__init__.py +11 -0
- claude_context/skills/_cli_common.py +74 -0
- claude_context/skills/_index_manager.py +98 -0
- claude_context/skills/api_surface.py +219 -0
- claude_context/skills/evidence_retrieval.py +151 -0
- claude_context/skills/grounded_review.py +212 -0
- claude_context/synthesis/__init__.py +8 -0
- claude_context/synthesis/editor_agent.py +391 -0
- claude_context/synthesis/llm_synthesizer.py +153 -0
- claude_context/synthesis/logic_explainer.py +235 -0
- claude_context/synthesis/multi_review_pipeline.py +717 -0
- claude_context/synthesis/prompt_builder.py +439 -0
- claude_context/synthesis/providers.py +115 -0
- claude_context/synthesis/validators.py +458 -0
- code_finder-0.1.0.dist-info/METADATA +823 -0
- code_finder-0.1.0.dist-info/RECORD +37 -0
- code_finder-0.1.0.dist-info/WHEEL +5 -0
- code_finder-0.1.0.dist-info/entry_points.txt +4 -0
- code_finder-0.1.0.dist-info/top_level.txt +1 -0
claude_context/search.py
ADDED
|
@@ -0,0 +1,493 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Hybrid Search for Claude Context
|
|
3
|
+
|
|
4
|
+
Combines BM25 (keyword) and vector (semantic) search for optimal code retrieval.
|
|
5
|
+
This gives us the best of both worlds:
|
|
6
|
+
- BM25: Exact matches, variable names, specific terms
|
|
7
|
+
- Vector: Conceptual similarity, related functionality
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
import time
|
|
12
|
+
from typing import List, Dict, Any, Optional, Tuple
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
import re
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
from rank_bm25 import BM25Okapi
|
|
19
|
+
|
|
20
|
+
from .config import ClaudeContextConfig, MilvusManager
|
|
21
|
+
from .embeddings import LocalEmbeddings
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class SearchResult:
    """One retrieved code chunk plus the scores used to rank it."""

    # Chunk text and its location in the indexed repository.
    content: str
    file_path: str
    file_name: str
    start_line: int
    end_line: int
    language: str
    chunk_type: str  # function, class, method, import_block, etc.
    chunk_name: Optional[str] = None
    parent_context: Optional[str] = None  # Full scope chain (e.g., "ClassName > method_name")

    # Enhanced metadata (from EnhancedASTChunker)
    signature: Optional[str] = None  # Full function signature with params and types
    docstring: Optional[str] = None  # Extracted docstring
    imports: Optional[str] = None  # JSON list of imports used in this chunk
    return_type: Optional[str] = None  # Return type annotation

    # Scoring (higher is better for all three)
    vector_score: float = 0.0  # Similarity from the vector search
    bm25_score: float = 0.0  # BM25 score (unbounded)
    combined_score: float = 0.0  # Weighted combination

    # Identifiers
    chunk_id: str = ""
    milvus_id: Optional[int] = None

    def format_result(self, include_scores: bool = False, include_metadata: bool = False) -> str:
        """Render the result as a header line followed by a short content preview.

        Args:
            include_scores: Append a line with the vector, BM25 and combined scores.
            include_metadata: Append signature, return type and the first
                docstring line (truncated to 80 characters) when present.

        Returns:
            The header, a newline, and the first three lines of the content
            (followed by an ellipsis line when the content is longer).
        """
        header_parts = [f"📄 {self.file_name}"]
        if self.parent_context:
            header_parts.append(f" > {self.parent_context}")
        if self.chunk_name:
            header_parts.append(f" > {self.chunk_name}")
        if self.chunk_type:
            header_parts.append(f" ({self.chunk_type})")
        header_parts.append(f" [Lines {self.start_line}-{self.end_line}]")

        if include_scores:
            header_parts.append(
                f"\n Scores: Vector={self.vector_score:.3f}, BM25={self.bm25_score:.3f}, Combined={self.combined_score:.3f}"
            )

        if include_metadata:
            if self.signature:
                header_parts.append(f"\n Signature: {self.signature}")
            if self.return_type:
                header_parts.append(f"\n Returns: {self.return_type}")
            if self.docstring:
                # Only the first docstring line is shown, capped at 80 chars.
                doc_summary = self.docstring.split('\n')[0][:80]
                header_parts.append(f"\n Doc: {doc_summary}")

        header = "".join(header_parts)

        # Preview: first three lines of the chunk, each prefixed for display.
        content_lines = self.content.split('\n')
        preview = '\n'.join(f" {line}" for line in content_lines[:3])
        if len(content_lines) > 3:
            preview += "\n ..."

        return f"{header}\n{preview}"
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class HybridSearcher:
|
|
91
|
+
"""
|
|
92
|
+
Hybrid search combining BM25 and vector search.
|
|
93
|
+
|
|
94
|
+
This class provides:
|
|
95
|
+
1. BM25 for keyword/exact matching
|
|
96
|
+
2. Vector search for semantic similarity
|
|
97
|
+
3. Hybrid ranking combining both scores
|
|
98
|
+
"""
|
|
99
|
+
|
|
100
|
+
def __init__(
    self,
    config: ClaudeContextConfig,
    embeddings: LocalEmbeddings,
    milvus_manager: MilvusManager,
    bm25_weight: float = 0.5,
    vector_weight: float = 0.5
):
    """
    Initialize the hybrid searcher.

    The two weights are normalized so that they sum to 1; only their
    ratio matters.

    Args:
        config: Configuration
        embeddings: Embeddings model
        milvus_manager: Milvus connection manager
        bm25_weight: Relative weight for BM25 scores (non-negative)
        vector_weight: Relative weight for vector scores (non-negative)

    Raises:
        ValueError: If a required dependency is missing, a weight is
            negative, or both weights are zero.
    """
    if not config:
        raise ValueError("config is required")
    if not embeddings:
        raise ValueError("embeddings is required")
    if not milvus_manager:
        raise ValueError("milvus_manager is required")

    # Validate before dividing: a zero total used to surface as a bare
    # ZeroDivisionError, and a negative weight silently inverted a signal.
    if bm25_weight < 0 or vector_weight < 0:
        raise ValueError("bm25_weight and vector_weight must be non-negative")
    total_weight = bm25_weight + vector_weight
    if total_weight <= 0:
        raise ValueError("bm25_weight and vector_weight must not both be zero")

    # Normalize weights
    self.bm25_weight = bm25_weight / total_weight
    self.vector_weight = vector_weight / total_weight

    self.config = config
    self.embeddings = embeddings
    self.milvus_client = milvus_manager.get_client()
    # NOTE(review): hard-coded here; confirm it always matches the
    # collection name the indexer and config use.
    self.collection_name = "code_chunks"

    # BM25 components (will be built from indexed data)
    self.bm25_index: Optional[BM25Okapi] = None
    self.bm25_documents: List[List[str]] = []  # one token list per document
    self.bm25_metadata: List[Dict[str, Any]] = []

    logger.info(f"HybridSearcher initialized (BM25={self.bm25_weight:.2f}, Vector={self.vector_weight:.2f})")
|
|
141
|
+
|
|
142
|
+
def build_bm25_index(self) -> None:
    """
    Build BM25 index from Milvus data.

    Loads up to ``max_documents`` rows (without vectors) from the Milvus
    collection, tokenizes their content and builds an in-memory BM25
    index plus a parallel metadata list used to materialize results.

    If the collection is empty the BM25 state is reset, so a previously
    built index is never left behind describing data that is gone.

    Raises:
        ValueError: If the collection does not exist.
    """
    logger.info("Building BM25 index from Milvus data...")

    # Check if collection exists
    if not self.milvus_client.has_collection(self.collection_name):
        raise ValueError(f"Collection '{self.collection_name}' does not exist. Index a repository first.")

    # Upper bound on rows fetched in one query. Larger collections are
    # truncated; pagination would be needed to cover them fully.
    max_documents = 10000

    # Query all documents (without vectors to save memory)
    results = self.milvus_client.query(
        collection_name=self.collection_name,
        filter="",  # No filter - get everything
        output_fields=["content", "file_path", "file_name", "start_line", "end_line",
                       "language", "chunk_type", "chunk_name", "parent_context", "chunk_id",
                       "signature", "docstring", "imports", "return_type"],  # Enhanced fields
        limit=max_documents
    )

    if not results:
        logger.warning("No documents found in Milvus")
        # Drop any stale index from an earlier build.
        self.bm25_index = None
        self.bm25_documents = []
        self.bm25_metadata = []
        return

    if len(results) >= max_documents:
        logger.warning(
            f"BM25 index limited to the first {max_documents} documents; "
            "keyword search may miss chunks beyond that limit"
        )

    # Build into locals first so the instance is only updated once the
    # whole corpus has been prepared.
    documents: List[List[str]] = []
    metadata: List[Dict[str, Any]] = []

    for doc in results:
        # Tokenize content for BM25
        content = doc.get("content", "")
        documents.append(self._tokenize_code(content))

        # Store metadata for retrieval (including enhanced fields)
        metadata.append({
            "id": doc.get("id"),
            "content": content,
            "file_path": doc.get("file_path", ""),
            "file_name": doc.get("file_name", ""),
            "start_line": doc.get("start_line", 0),
            "end_line": doc.get("end_line", 0),
            "language": doc.get("language", ""),
            "chunk_type": doc.get("chunk_type", ""),
            "chunk_name": doc.get("chunk_name", ""),
            "parent_context": doc.get("parent_context", ""),
            "chunk_id": doc.get("chunk_id", ""),
            # Enhanced fields
            "signature": doc.get("signature", ""),
            "docstring": doc.get("docstring", ""),
            "imports": doc.get("imports", ""),
            "return_type": doc.get("return_type", "")
        })

    self.bm25_documents = documents
    self.bm25_metadata = metadata

    # Build BM25 index
    self.bm25_index = BM25Okapi(self.bm25_documents)

    logger.info(f"✅ BM25 index built with {len(self.bm25_documents)} documents")
|
|
204
|
+
|
|
205
|
+
def _tokenize_code(self, text: str) -> List[str]:
    """
    Tokenize code for BM25 indexing.

    This handles:
    - Special characters removal (only runs of word characters survive)
    - snake_case splitting (get_user -> get, user)
    - camelCase / PascalCase / acronym splitting
      (getUserName -> get, user, name; HTTPServer -> http, server)
    - Digit runs as their own pieces (v2 -> v, 2)
    - Lowercasing for consistency

    When an identifier part splits into several pieces, the unsplit
    lowercase form is emitted as well (first), so a query for the exact
    identifier still matches.

    Args:
        text: Source text or query string.

    Returns:
        Lowercase tokens in order of appearance.
    """
    expanded_tokens: List[str] = []

    # Keep the original case here: the camelCase boundaries are lost if
    # the text is lower-cased before splitting.
    for word in re.findall(r'\w+', text):
        for part in word.split('_'):
            if not part:
                continue

            lowered = part.lower()
            # Acronym run | capitalized-or-lowercase word | digit run
            pieces = re.findall(r'[A-Z]+(?![a-z])|[A-Z]?[a-z]+|\d+', part)

            # Only trust the split when it accounts for every character;
            # otherwise (e.g. non-ASCII letters) keep the part whole.
            if len(pieces) > 1 and ''.join(pieces) == part:
                expanded_tokens.append(lowered)
                expanded_tokens.extend(piece.lower() for piece in pieces)
            else:
                expanded_tokens.append(lowered)

    return expanded_tokens
|
|
233
|
+
|
|
234
|
+
def search(
    self,
    query: str,
    limit: int = 10,
    filter_chunk_types: Optional[List[str]] = None,
    filter_languages: Optional[List[str]] = None,
    filter_paths: Optional[List[str]] = None,
    rerank: bool = True
) -> List[SearchResult]:
    """
    Perform hybrid search combining BM25 and vector search.

    Filters are applied after retrieval, so more candidates are fetched
    than requested (5x with filters, 2x without) before truncating.

    Args:
        query: Search query
        limit: Maximum number of results
        filter_chunk_types: Filter by chunk types (function, class, etc.)
        filter_languages: Filter by programming languages
        filter_paths: Keep only results whose file is one of these paths
            or lies underneath one of them. Relative entries are resolved
            against the current working directory.
        rerank: Whether to use hybrid reranking

    Returns:
        List of SearchResult objects sorted by relevance
    """
    if not query or not query.strip():
        logger.warning("Empty search query")
        return []

    start_time = time.time()

    # Ensure BM25 index is built
    if self.bm25_index is None:
        self.build_bm25_index()
        if self.bm25_index is None:
            logger.warning("Could not build BM25 index, falling back to vector-only search")
            rerank = False

    # Over-fetch when post-filters are active so we have enough after filtering
    has_filters = filter_paths or filter_chunk_types or filter_languages
    fetch_limit = limit * 5 if has_filters else limit * 2

    # 1. Vector Search
    vector_results = self._vector_search(query, fetch_limit)

    # 2. BM25 Search (if available)
    bm25_results = []
    if self.bm25_index and rerank:
        bm25_results = self._bm25_search(query, fetch_limit)

    # 3. Combine and rerank
    if rerank and bm25_results:
        results = self._hybrid_rerank(vector_results, bm25_results, fetch_limit)
    else:
        results = vector_results[:fetch_limit]

    # 4. Apply filters
    if filter_paths:
        # Paths should be absolute; resolve any relative ones against cwd as fallback
        roots = [Path(p) if Path(p).is_absolute() else Path(p).resolve()
                 for p in filter_paths]

        def _is_under_a_root(file_path: str) -> bool:
            # Compare on path components, not raw string prefixes, so that
            # "/repo/src" does not also match "/repo/src2/file.py".
            candidate = Path(file_path)
            return any(candidate == root or root in candidate.parents
                       for root in roots)

        results = [r for r in results if _is_under_a_root(r.file_path)]
    if filter_chunk_types:
        results = [r for r in results if r.chunk_type in filter_chunk_types]
    if filter_languages:
        results = [r for r in results if r.language in filter_languages]

    elapsed = time.time() - start_time
    logger.info(f"Search completed in {elapsed:.3f}s - Found {len(results)} results")

    return results[:limit]
|
|
304
|
+
|
|
305
|
+
def _vector_search(self, query: str, limit: int) -> List[SearchResult]:
    """Embed the query and return the nearest chunks from Milvus.

    Args:
        query: Search query text.
        limit: Maximum number of hits requested from Milvus.

    Returns:
        One SearchResult per hit, in the order Milvus returned them, with
        ``vector_score`` set and the BM25/combined scores left at 0.
    """
    wanted_fields = ["content", "file_path", "file_name", "start_line", "end_line",
                     "language", "chunk_type", "chunk_name", "parent_context", "chunk_id",
                     "signature", "docstring", "imports", "return_type"]

    # A single query is embedded, so only the first vector is needed.
    query_vector = self.embeddings.embed_texts([query])[0]

    raw_hits = self.milvus_client.search(
        collection_name=self.collection_name,
        data=[query_vector.tolist()],
        limit=limit,
        output_fields=wanted_fields
    )

    found = []
    for hit_group in raw_hits:
        for hit in hit_group:
            entity = hit["entity"]
            found.append(SearchResult(
                content=entity.get("content", ""),
                file_path=entity.get("file_path", ""),
                file_name=entity.get("file_name", ""),
                start_line=entity.get("start_line", 0),
                end_line=entity.get("end_line", 0),
                language=entity.get("language", ""),
                chunk_type=entity.get("chunk_type", ""),
                chunk_name=entity.get("chunk_name"),
                parent_context=entity.get("parent_context"),
                # Enhanced fields
                signature=entity.get("signature"),
                docstring=entity.get("docstring"),
                imports=entity.get("imports"),
                return_type=entity.get("return_type"),
                # NOTE(review): treats the Milvus "distance" as a true
                # distance (smaller is closer). Confirm the collection's
                # metric type; for similarity metrics this would invert
                # the ranking.
                vector_score=1 - hit["distance"],
                chunk_id=entity.get("chunk_id", ""),
                milvus_id=hit.get("id")
            ))

    return found
|
|
347
|
+
|
|
348
|
+
def _bm25_search(self, query: str, limit: int) -> List[SearchResult]:
    """Perform BM25 keyword search over the in-memory index.

    Args:
        query: Search query text; tokenized the same way as the corpus.
        limit: Maximum number of top-scoring documents to consider.

    Returns:
        SearchResult objects for the top documents with a strictly
        positive BM25 score, best first. ``bm25_score`` is set;
        ``vector_score`` and ``combined_score`` stay at 0. Returns an
        empty list when no index has been built.
    """
    if not self.bm25_index:
        return []

    # Tokenize query
    query_tokens = self._tokenize_code(query)

    # Get BM25 scores
    scores = self.bm25_index.get_scores(query_tokens)

    # Get top-k indices
    top_indices = np.argsort(scores)[::-1][:limit]

    results = []
    for idx in top_indices:
        score = float(scores[idx])
        if not score > 0:  # Only include results with positive scores
            continue

        metadata = self.bm25_metadata[idx]
        results.append(SearchResult(
            content=metadata["content"],
            file_path=metadata["file_path"],
            file_name=metadata["file_name"],
            start_line=metadata["start_line"],
            end_line=metadata["end_line"],
            language=metadata["language"],
            chunk_type=metadata["chunk_type"],
            chunk_name=metadata["chunk_name"],
            parent_context=metadata["parent_context"],
            # Enhanced fields
            signature=metadata.get("signature"),
            docstring=metadata.get("docstring"),
            imports=metadata.get("imports"),
            return_type=metadata.get("return_type"),
            # Scoring
            bm25_score=score,
            chunk_id=metadata["chunk_id"],
            # Carry the Milvus primary key, as vector results do, so
            # BM25-only hits are not missing their id.
            milvus_id=metadata.get("id")
        ))

    return results
|
|
388
|
+
|
|
389
|
+
def _hybrid_rerank(
    self,
    vector_results: List[SearchResult],
    bm25_results: List[SearchResult],
    limit: int
) -> List[SearchResult]:
    """
    Merge vector and BM25 results and rank them by a weighted score.

    Results are de-duplicated on "file_path:start_line". Each score is
    divided by the largest score of its kind in the merged set, and the
    combined score is the weighted sum of the two normalized values.
    The passed-in SearchResult objects are updated in place.

    Args:
        vector_results: Results from the vector search.
        bm25_results: Results from the BM25 search.
        limit: Maximum number of results to return.

    Returns:
        Up to ``limit`` results sorted by ``combined_score``, best first.
    """
    merged: Dict[str, SearchResult] = {}

    # Vector hits seed the map; a repeated key keeps its best vector score.
    for candidate in vector_results:
        key = f"{candidate.file_path}:{candidate.start_line}"
        existing = merged.get(key)
        if existing is None:
            merged[key] = candidate
        else:
            existing.vector_score = max(existing.vector_score, candidate.vector_score)

    # BM25 hits either annotate an existing entry or add a new one.
    for candidate in bm25_results:
        key = f"{candidate.file_path}:{candidate.start_line}"
        if key in merged:
            merged[key].bm25_score = candidate.bm25_score
        else:
            merged[key] = candidate

    # Largest score of each kind, used as the normalization divisor.
    max_vector = max((r.vector_score for r in merged.values()), default=1.0)
    max_bm25 = max((r.bm25_score for r in merged.values()), default=1.0)

    for entry in merged.values():
        # A non-positive maximum cannot be divided by; that signal counts as 0.
        if max_vector > 0:
            norm_vector = entry.vector_score / max_vector
        else:
            norm_vector = 0
        if max_bm25 > 0:
            norm_bm25 = entry.bm25_score / max_bm25
        else:
            norm_bm25 = 0

        entry.combined_score = (
            self.vector_weight * norm_vector +
            self.bm25_weight * norm_bm25
        )

    ranked = list(merged.values())
    ranked.sort(key=lambda r: r.combined_score, reverse=True)

    return ranked[:limit]
|
|
449
|
+
|
|
450
|
+
def explain_search(self, query: str, limit: int = 5) -> str:
    """
    Run a search and return a human-readable report of the scoring.

    The report lists the query, the normalized weights, and each result
    formatted with its scores. Useful for debugging rankings.

    Args:
        query: Search query.
        limit: Maximum number of results to include.

    Returns:
        The multi-line explanation text.
    """
    hits = self.search(query, limit=limit)

    sections = [
        f"🔍 Search Query: '{query}'\n",
        f"⚖️ Weights: BM25={self.bm25_weight:.2f}, Vector={self.vector_weight:.2f}\n\n",
    ]
    sections.extend(
        f"{rank}. {hit.format_result(include_scores=True)}\n\n"
        for rank, hit in enumerate(hits, 1)
    )

    return "".join(sections)
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
# Convenience function
|
|
468
|
+
def create_hybrid_searcher(
    config: ClaudeContextConfig,
    embeddings: LocalEmbeddings,
    milvus_manager: MilvusManager,
    bm25_weight: float = 0.5
) -> HybridSearcher:
    """
    Build a HybridSearcher whose two weights sum to 1.

    Args:
        config: Configuration
        embeddings: Embeddings model
        milvus_manager: Milvus manager
        bm25_weight: Weight for BM25 (0-1); the vector weight is the
            complement, 1 - bm25_weight

    Returns:
        Configured HybridSearcher
    """
    return HybridSearcher(
        config=config,
        embeddings=embeddings,
        milvus_manager=milvus_manager,
        bm25_weight=bm25_weight,
        vector_weight=1.0 - bm25_weight
    )
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Bridge skills for vibe2doc.
|
|
3
|
+
|
|
4
|
+
Standalone CLI-callable modules that expose vibe2doc's code analysis
|
|
5
|
+
capabilities for use in any documentation workflow.
|
|
6
|
+
|
|
7
|
+
Skills:
|
|
8
|
+
evidence_retrieval: Search codebase for evidence matching a query
|
|
9
|
+
grounded_review: Verify document claims against source code
|
|
10
|
+
api_surface: Extract API surface from source files (deterministic)
|
|
11
|
+
"""
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Shared CLI utilities for bridge skills.
|
|
3
|
+
|
|
4
|
+
Handles argument parsing, logging to stderr, and JSON output routing
|
|
5
|
+
so that stdout stays clean for machine-consumable JSON.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import argparse
|
|
9
|
+
import json
|
|
10
|
+
import logging
|
|
11
|
+
import sys
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any, Dict, Optional
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def setup_logging():
    """Route all logging to stderr at INFO level.

    Replaces every handler on the root logger with a single stderr
    handler so that stdout carries nothing but the JSON output.
    """
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    root_logger = logging.getLogger()
    # Drop whatever handlers were installed before (they may write to stdout).
    root_logger.handlers.clear()

    stderr_handler = logging.StreamHandler(sys.stderr)
    stderr_handler.setFormatter(formatter)

    root_logger.addHandler(stderr_handler)
    root_logger.setLevel(logging.INFO)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def add_common_args(parser: argparse.ArgumentParser):
    """Register the options shared by every bridge-skill CLI.

    Adds ``--ticket``, ``--base-path`` and ``--db-path`` (optional
    strings defaulting to None) and the ``--reindex`` flag.

    Args:
        parser: The parser to extend in place.
    """
    # Optional string-valued options, in the order they appear in --help.
    string_options = (
        ('--ticket',
         'Docs-orchestrator ticket ID (stored in output metadata)'),
        ('--base-path',
         'Docs-orchestrator output base path. When set, writes JSON to file instead of stdout.'),
        ('--db-path',
         'Override Milvus DB path (default: {repo}/.vibe2doc/index.db)'),
    )
    for flag, help_text in string_options:
        parser.add_argument(flag, type=str, default=None, help=help_text)

    parser.add_argument(
        '--reindex', action='store_true',
        help='Force re-indexing even if an existing index is found'
    )
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def output_result(result: Dict[str, Any], args: argparse.Namespace, filename: str = "output.json"):
    """Output JSON result to stdout or file depending on mode.

    The caller's ``result`` dict is not modified; the ticket, when
    present, is added to a copy before serialization. Values that JSON
    cannot encode natively are converted with ``str``.

    Args:
        result: The result dict to serialize
        args: Parsed CLI args (checks for --base-path and --ticket)
        filename: Output filename when writing to --base-path
    """
    # Work on a shallow copy so adding the ticket does not leak back
    # into the caller's data.
    payload = dict(result)
    if args.ticket:
        payload["ticket"] = args.ticket

    json_str = json.dumps(payload, indent=2, default=str)

    if args.base_path:
        out_dir = Path(args.base_path)
        out_dir.mkdir(parents=True, exist_ok=True)
        out_path = out_dir / filename
        # Explicit encoding: do not depend on the platform default.
        out_path.write_text(json_str, encoding="utf-8")
        logging.getLogger(__name__).info(f"Output written to {out_path}")
    else:
        print(json_str)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def error_result(error: str, exit_code: int = 1):
    """Emit a JSON error object on stdout, then terminate the process.

    Args:
        error: Human-readable error message.
        exit_code: Process exit status; also included in the JSON object.
    """
    payload = {"error": error, "exit_code": exit_code}
    print(json.dumps(payload))
    sys.exit(exit_code)
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Index lifecycle manager for bridge skills.
|
|
3
|
+
|
|
4
|
+
Handles the expensive indexing operation with reuse: if an existing
|
|
5
|
+
Milvus DB is found for a repo, reuse it. Otherwise, index from scratch.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Dict, Any, Optional, Tuple
|
|
11
|
+
|
|
12
|
+
from ..config import ClaudeContextConfig, MilvusManager
|
|
13
|
+
from ..embeddings import LocalEmbeddings
|
|
14
|
+
from ..indexer import RepositoryIndexer
|
|
15
|
+
from ..search import HybridSearcher, create_hybrid_searcher
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _default_db_path(repo_path: str) -> str:
    """Return the index DB location inside the repo: ``<repo>/.vibe2doc/index.db``.

    The repository path is resolved to an absolute path first.
    """
    repo_root = Path(repo_path).resolve()
    return str(repo_root.joinpath(".vibe2doc", "index.db"))
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def ensure_index(
    repo_path: str,
    db_path: Optional[str] = None,
    reindex: bool = False,
) -> Tuple[HybridSearcher, Dict[str, Any]]:
    """Return a ready-to-use searcher for a repository, indexing it if needed.

    An existing DB file is reused when ``reindex`` is False and it already
    contains the configured collection. In every other case the repository
    is indexed via RepositoryIndexer.

    Args:
        repo_path: Path to the repository to index
        db_path: Override Milvus DB path (default: {repo}/.vibe2doc/index.db)
        reindex: Force re-indexing even if index exists

    Returns:
        (searcher, info) where info holds ``repo_path``, ``index_path``,
        ``reused_existing`` and ``chunks_count``
    """
    repo_path = str(Path(repo_path).resolve())
    if db_path is None:
        db_path = _default_db_path(repo_path)

    # Create the parent directory first, then record whether the DB file
    # was already there before Milvus gets a chance to create it.
    db_file = Path(db_path)
    db_file.parent.mkdir(parents=True, exist_ok=True)
    db_exists = db_file.exists()

    config = ClaudeContextConfig(db_path=db_path)
    milvus_mgr = MilvusManager(config)
    embeddings = LocalEmbeddings(config.embedding_model)

    info = {
        "repo_path": repo_path,
        "index_path": db_path,
        "reused_existing": False,
        "chunks_count": 0,
    }

    def _index_from_scratch() -> int:
        # Run the full indexing pipeline and report how many chunks it stored.
        indexer = RepositoryIndexer(config, embeddings, milvus_mgr)
        stats = indexer.index_repository(repo_path, show_progress=True)
        return stats.get("chunks_indexed", 0)

    if not db_exists or reindex:
        # Fresh index
        if reindex:
            logger.info(f"Re-indexing {repo_path} (forced)")
        else:
            logger.info(f"No existing index found, indexing {repo_path}")
        info["chunks_count"] = _index_from_scratch()
    else:
        logger.info(f"Reusing existing index at {db_path}")
        client = milvus_mgr.get_client()
        # NOTE(review): this checks config.collection_name, while
        # HybridSearcher uses the literal "code_chunks"; confirm they agree.
        if client.has_collection(config.collection_name):
            count = client.query(
                collection_name=config.collection_name,
                filter="",
                output_fields=["count(*)"],
            )
            info["chunks_count"] = count[0].get("count(*)", 0) if count else 0
            info["reused_existing"] = True
        else:
            # DB file exists but collection doesn't — need to index
            logger.info("DB exists but collection missing, indexing from scratch")
            info["chunks_count"] = _index_from_scratch()

    # Create searcher and build BM25 index
    searcher = create_hybrid_searcher(config, embeddings, milvus_mgr)
    searcher.build_bm25_index()

    logger.info(
        f"Index ready: {info['chunks_count']} chunks, "
        f"reused={info['reused_existing']}"
    )
    return searcher, info
|