mcp-vector-search 0.0.3__py3-none-any.whl → 0.4.11__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only.

Potentially problematic release: this version of mcp-vector-search has been flagged as potentially problematic and may warrant closer review.

Files changed (49)
  1. mcp_vector_search/__init__.py +3 -2
  2. mcp_vector_search/cli/commands/auto_index.py +397 -0
  3. mcp_vector_search/cli/commands/config.py +88 -40
  4. mcp_vector_search/cli/commands/index.py +198 -52
  5. mcp_vector_search/cli/commands/init.py +472 -58
  6. mcp_vector_search/cli/commands/install.py +284 -0
  7. mcp_vector_search/cli/commands/mcp.py +495 -0
  8. mcp_vector_search/cli/commands/search.py +241 -87
  9. mcp_vector_search/cli/commands/status.py +184 -58
  10. mcp_vector_search/cli/commands/watch.py +34 -35
  11. mcp_vector_search/cli/didyoumean.py +184 -0
  12. mcp_vector_search/cli/export.py +320 -0
  13. mcp_vector_search/cli/history.py +292 -0
  14. mcp_vector_search/cli/interactive.py +342 -0
  15. mcp_vector_search/cli/main.py +163 -26
  16. mcp_vector_search/cli/output.py +63 -45
  17. mcp_vector_search/config/defaults.py +50 -36
  18. mcp_vector_search/config/settings.py +49 -35
  19. mcp_vector_search/core/auto_indexer.py +298 -0
  20. mcp_vector_search/core/connection_pool.py +322 -0
  21. mcp_vector_search/core/database.py +335 -25
  22. mcp_vector_search/core/embeddings.py +73 -29
  23. mcp_vector_search/core/exceptions.py +19 -2
  24. mcp_vector_search/core/factory.py +310 -0
  25. mcp_vector_search/core/git_hooks.py +345 -0
  26. mcp_vector_search/core/indexer.py +237 -73
  27. mcp_vector_search/core/models.py +21 -19
  28. mcp_vector_search/core/project.py +73 -58
  29. mcp_vector_search/core/scheduler.py +330 -0
  30. mcp_vector_search/core/search.py +574 -86
  31. mcp_vector_search/core/watcher.py +48 -46
  32. mcp_vector_search/mcp/__init__.py +4 -0
  33. mcp_vector_search/mcp/__main__.py +25 -0
  34. mcp_vector_search/mcp/server.py +701 -0
  35. mcp_vector_search/parsers/base.py +30 -31
  36. mcp_vector_search/parsers/javascript.py +74 -48
  37. mcp_vector_search/parsers/python.py +57 -49
  38. mcp_vector_search/parsers/registry.py +47 -32
  39. mcp_vector_search/parsers/text.py +179 -0
  40. mcp_vector_search/utils/__init__.py +40 -0
  41. mcp_vector_search/utils/gitignore.py +229 -0
  42. mcp_vector_search/utils/timing.py +334 -0
  43. mcp_vector_search/utils/version.py +47 -0
  44. {mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.11.dist-info}/METADATA +173 -7
  45. mcp_vector_search-0.4.11.dist-info/RECORD +54 -0
  46. mcp_vector_search-0.0.3.dist-info/RECORD +0 -35
  47. {mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.11.dist-info}/WHEEL +0 -0
  48. {mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.11.dist-info}/entry_points.txt +0 -0
  49. {mcp_vector_search-0.0.3.dist-info → mcp_vector_search-0.4.11.dist-info}/licenses/LICENSE +0 -0
@@ -2,10 +2,11 @@
 
 import re
 from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 from loguru import logger
 
+from .auto_indexer import AutoIndexer, SearchTriggeredIndexer
 from .database import VectorDatabase
 from .exceptions import SearchError
 from .models import SearchResult
@@ -18,48 +19,70 @@ class SemanticSearchEngine:
         self,
         database: VectorDatabase,
         project_root: Path,
-        similarity_threshold: float = 0.7,
+        similarity_threshold: float = 0.3,
+        auto_indexer: AutoIndexer | None = None,
+        enable_auto_reindex: bool = True,
     ) -> None:
         """Initialize semantic search engine.
-
+
         Args:
             database: Vector database instance
             project_root: Project root directory
             similarity_threshold: Default similarity threshold
+            auto_indexer: Optional auto-indexer for semi-automatic reindexing
+            enable_auto_reindex: Whether to enable automatic reindexing
         """
         self.database = database
         self.project_root = project_root
         self.similarity_threshold = similarity_threshold
+        self.auto_indexer = auto_indexer
+        self.enable_auto_reindex = enable_auto_reindex
+
+        # Initialize search-triggered indexer if auto-indexer is provided
+        self.search_triggered_indexer = None
+        if auto_indexer and enable_auto_reindex:
+            self.search_triggered_indexer = SearchTriggeredIndexer(auto_indexer)
 
     async def search(
         self,
         query: str,
         limit: int = 10,
-        filters: Optional[Dict[str, Any]] = None,
-        similarity_threshold: Optional[float] = None,
+        filters: dict[str, Any] | None = None,
+        similarity_threshold: float | None = None,
         include_context: bool = True,
-    ) -> List[SearchResult]:
+    ) -> list[SearchResult]:
         """Perform semantic search for code.
-
+
         Args:
             query: Search query
             limit: Maximum number of results
             filters: Optional filters (language, file_path, etc.)
             similarity_threshold: Minimum similarity score
             include_context: Whether to include context lines
-
+
         Returns:
             List of search results
         """
         if not query.strip():
             return []
 
-        threshold = similarity_threshold or self.similarity_threshold
+        # Auto-reindex check before search
+        if self.search_triggered_indexer:
+            try:
+                await self.search_triggered_indexer.pre_search_hook()
+            except Exception as e:
+                logger.warning(f"Auto-reindex check failed: {e}")
+
+        threshold = (
+            similarity_threshold
+            if similarity_threshold is not None
+            else self._get_adaptive_threshold(query)
+        )
 
         try:
             # Preprocess query
             processed_query = self._preprocess_query(query)
-
+
             # Perform vector search
             results = await self.database.search(
                 query=processed_query,
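
The hunk above is the heart of this release: the default similarity threshold drops from 0.7 to 0.3, an optional AutoIndexer can be injected, and every search first runs pre_search_hook() before falling back to an adaptive threshold when the caller passes none. A minimal usage sketch follows, assuming the `database` and `auto_indexer` objects are built elsewhere in the package (for example via the new core/factory.py listed above); only the signatures visible in this hunk are taken from the diff:

```python
from pathlib import Path

from mcp_vector_search.core.search import SemanticSearchEngine


async def demo(database, auto_indexer) -> None:
    # database and auto_indexer are assumed to be constructed elsewhere;
    # their setup is not part of this diff.
    engine = SemanticSearchEngine(
        database=database,
        project_root=Path.cwd(),
        similarity_threshold=0.3,   # new default (was 0.7)
        auto_indexer=auto_indexer,  # optional; enables the pre-search reindex hook
        enable_auto_reindex=True,
    )
    # No explicit threshold passed, so _get_adaptive_threshold(query) decides.
    results = await engine.search("auth handler", limit=5)
    for result in results:
        print(result.rank, result.file_path, round(result.similarity_score, 3))
```

Run it with asyncio.run(demo(...)) once the two dependencies exist; passing an explicit similarity_threshold to search() bypasses the adaptive logic entirely.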
@@ -77,7 +100,9 @@ class SemanticSearchEngine:
             # Apply additional ranking if needed
             ranked_results = self._rerank_results(enhanced_results, query)
 
-            logger.debug(f"Search for '{query}' returned {len(ranked_results)} results")
+            logger.debug(
+                f"Search for '{query}' with threshold {threshold:.3f} returned {len(ranked_results)} results"
+            )
             return ranked_results
 
         except Exception as e:
@@ -87,29 +112,31 @@
     async def search_similar(
         self,
         file_path: Path,
-        function_name: Optional[str] = None,
+        function_name: str | None = None,
         limit: int = 10,
-        similarity_threshold: Optional[float] = None,
-    ) -> List[SearchResult]:
+        similarity_threshold: float | None = None,
+    ) -> list[SearchResult]:
         """Find code similar to a specific function or file.
-
+
         Args:
             file_path: Path to the reference file
             function_name: Specific function name (optional)
             limit: Maximum number of results
             similarity_threshold: Minimum similarity score
-
+
         Returns:
             List of similar code results
         """
         try:
             # Read the reference file
-            with open(file_path, "r", encoding="utf-8") as f:
+            with open(file_path, encoding="utf-8") as f:
                 content = f.read()
 
             # If function name is specified, try to extract just that function
             if function_name:
-                function_content = self._extract_function_content(content, function_name)
+                function_content = self._extract_function_content(
+                    content, function_name
+                )
                 if function_content:
                     content = function_content
 
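
search_similar() gains modern annotations and reflowed calls but keeps its behavior: it reads the reference file itself and, when a function name is given, narrows the query to that function's body. A short sketch, reusing the engine from above; the file and function names here are hypothetical placeholders, not taken from the diff:

```python
from pathlib import Path


async def find_similar(engine) -> None:
    results = await engine.search_similar(
        file_path=Path("src/handlers.py"),   # hypothetical reference file
        function_name="process_request",     # hypothetical function to extract
        limit=5,
    )
    for result in results:
        print(result.file_path, result.function_name, result.similarity_score)
```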
@@ -128,27 +155,27 @@
     async def search_by_context(
         self,
         context_description: str,
-        focus_areas: Optional[List[str]] = None,
+        focus_areas: list[str] | None = None,
         limit: int = 10,
-    ) -> List[SearchResult]:
+    ) -> list[SearchResult]:
         """Search for code based on contextual description.
-
+
         Args:
             context_description: Description of what you're looking for
             focus_areas: Areas to focus on (e.g., ["security", "authentication"])
             limit: Maximum number of results
-
+
         Returns:
             List of contextually relevant results
         """
         # Build enhanced query with focus areas
         query_parts = [context_description]
-
+
         if focus_areas:
             query_parts.extend(focus_areas)
-
+
         enhanced_query = " ".join(query_parts)
-
+
         return await self.search(
             query=enhanced_query,
             limit=limit,
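
search_by_context() simply appends the focus areas to the description and joins everything with spaces before delegating to search(), so the sketch below ends up issuing the query "input validation security authentication" (engine as in the earlier sketch):

```python
async def find_by_context(engine) -> None:
    # Internally builds the query "input validation security authentication".
    results = await engine.search_by_context(
        "input validation",
        focus_areas=["security", "authentication"],
        limit=5,
    )
    print(len(results))
```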
@@ -157,46 +184,161 @@
 
     def _preprocess_query(self, query: str) -> str:
         """Preprocess search query for better results.
-
+
         Args:
             query: Raw search query
-
+
         Returns:
             Processed query
         """
         # Remove extra whitespace
         query = re.sub(r"\s+", " ", query.strip())
-
-        # Expand common abbreviations
+
+        # Expand common programming abbreviations and synonyms
         expansions = {
-            "auth": "authentication",
-            "db": "database",
-            "api": "application programming interface",
-            "ui": "user interface",
-            "util": "utility",
-            "config": "configuration",
+            "auth": "authentication authorize login",
+            "db": "database data storage",
+            "api": "application programming interface endpoint",
+            "ui": "user interface frontend view",
+            "util": "utility helper function",
+            "config": "configuration settings options",
+            "async": "asynchronous await promise",
+            "sync": "synchronous blocking",
+            "func": "function method",
+            "var": "variable",
+            "param": "parameter argument",
+            "init": "initialize setup create",
+            "parse": "parsing parser analyze",
+            "validate": "validation check verify",
+            "handle": "handler process manage",
+            "error": "exception failure bug",
+            "test": "testing unittest spec",
+            "mock": "mocking stub fake",
+            "log": "logging logger debug",
         }
-
+
+        # Add programming language keywords and concepts
+        programming_concepts = {
+            "class": "class object type",
+            "method": "method function procedure",
+            "property": "property attribute field",
+            "import": "import require include",
+            "export": "export module public",
+            "return": "return yield output",
+            "loop": "loop iterate for while",
+            "condition": "condition if else branch",
+            "array": "array list collection",
+            "string": "string text character",
+            "number": "number integer float",
+            "boolean": "boolean true false",
+        }
+
+        # Merge all expansions
+        all_expansions = {**expansions, **programming_concepts}
+
         words = query.lower().split()
         expanded_words = []
-
+
         for word in words:
-            if word in expansions:
-                expanded_words.extend([word, expansions[word]])
-            else:
-                expanded_words.append(word)
-
-        return " ".join(expanded_words)
+            # Add original word
+            expanded_words.append(word)
+
+            # Add expansions if available
+            if word in all_expansions:
+                expanded_words.extend(all_expansions[word].split())
+
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_words = []
+        for word in expanded_words:
+            if word not in seen:
+                seen.add(word)
+                unique_words.append(word)
+
+        return " ".join(unique_words)
+
+    def _get_adaptive_threshold(self, query: str) -> float:
+        """Get adaptive similarity threshold based on query characteristics.
+
+        Args:
+            query: Search query
+
+        Returns:
+            Adaptive similarity threshold
+        """
+        base_threshold = self.similarity_threshold
+        query_lower = query.lower()
+        words = query.split()
+
+        # Adjust threshold based on query characteristics
+
+        # 1. Single word queries - lower threshold for broader results
+        if len(words) == 1:
+            return max(0.01, base_threshold - 0.29)
+
+        # 2. Very specific technical terms - lower threshold
+        technical_terms = [
+            "javascript",
+            "typescript",
+            "python",
+            "java",
+            "cpp",
+            "rust",
+            "go",
+            "function",
+            "class",
+            "method",
+            "variable",
+            "import",
+            "export",
+            "async",
+            "await",
+            "promise",
+            "callback",
+            "api",
+            "database",
+            "parser",
+            "compiler",
+            "interpreter",
+            "syntax",
+            "semantic",
+            "mcp",
+            "gateway",
+            "server",
+            "client",
+            "protocol",
+        ]
+
+        if any(term in query_lower for term in technical_terms):
+            return max(0.01, base_threshold - 0.29)
+
+        # 3. Short queries (2-3 words) - slightly lower threshold
+        if len(words) <= 3:
+            return max(0.1, base_threshold - 0.1)
+
+        # 4. Long queries (>6 words) - higher threshold for precision
+        if len(words) > 6:
+            return min(0.8, base_threshold + 0.1)
+
+        # 5. Queries with exact identifiers (CamelCase, snake_case)
+        if re.search(r"\b[A-Z][a-zA-Z]*\b", query) or "_" in query:
+            return max(0.05, base_threshold - 0.25)
+
+        # 6. Common programming patterns
+        if any(pattern in query for pattern in ["()", ".", "->", "=>", "::"]):
+            return max(0.25, base_threshold - 0.1)
+
+        return base_threshold
 
     async def _enhance_result(
         self, result: SearchResult, include_context: bool
     ) -> SearchResult:
         """Enhance search result with additional information.
-
+
         Args:
             result: Original search result
             include_context: Whether to include context lines
-
+
         Returns:
             Enhanced search result
         """
@@ -205,7 +347,7 @@
 
         try:
             # Read the source file to get context
-            with open(result.file_path, "r", encoding="utf-8") as f:
+            with open(result.file_path, encoding="utf-8") as f:
                 lines = f.readlines()
 
             # Get context lines before and after
@@ -216,9 +358,7 @@
             context_before = [
                 line.rstrip() for line in lines[start_idx : result.start_line - 1]
             ]
-            context_after = [
-                line.rstrip() for line in lines[result.end_line : end_idx]
-            ]
+            context_after = [line.rstrip() for line in lines[result.end_line : end_idx]]
 
             # Update result with context
             result.context_before = context_before
@@ -230,66 +370,414 @@
         return result
 
     def _rerank_results(
-        self, results: List[SearchResult], query: str
-    ) -> List[SearchResult]:
-        """Apply additional ranking to search results.
-
+        self, results: list[SearchResult], query: str
+    ) -> list[SearchResult]:
+        """Apply advanced ranking to search results using multiple factors.
+
         Args:
             results: Original search results
             query: Original search query
-
+
         Returns:
             Reranked search results
         """
-        # Simple reranking based on additional factors
+        if not results:
+            return results
+
         query_lower = query.lower()
-
+        query_words = set(query_lower.split())
+
+        for result in results:
+            # Start with base similarity score
+            score = result.similarity_score
+
+            # Factor 1: Exact matches in identifiers (high boost)
+            if result.function_name:
+                func_name_lower = result.function_name.lower()
+                if query_lower in func_name_lower:
+                    score += 0.15  # Strong boost for function name match
+                # Partial word matches
+                for word in query_words:
+                    if word in func_name_lower:
+                        score += 0.05
+
+            if result.class_name:
+                class_name_lower = result.class_name.lower()
+                if query_lower in class_name_lower:
+                    score += 0.15  # Strong boost for class name match
+                # Partial word matches
+                for word in query_words:
+                    if word in class_name_lower:
+                        score += 0.05
+
+            # Factor 2: File name relevance
+            file_name_lower = result.file_path.name.lower()
+            if query_lower in file_name_lower:
+                score += 0.08
+            for word in query_words:
+                if word in file_name_lower:
+                    score += 0.03
+
+            # Factor 3: Content density (how many query words appear)
+            content_lower = result.content.lower()
+            word_matches = sum(1 for word in query_words if word in content_lower)
+            if word_matches > 0:
+                density_boost = (word_matches / len(query_words)) * 0.1
+                score += density_boost
+
+            # Factor 4: Code structure preferences
+            # Boost functions over general code blocks
+            if result.chunk_type == "function":
+                score += 0.05
+            elif result.chunk_type == "class":
+                score += 0.03
+
+            # Factor 5: File type preferences (prefer source files over tests/docs)
+            file_ext = result.file_path.suffix.lower()
+            if file_ext in [".py", ".js", ".ts", ".java", ".cpp", ".c", ".go", ".rs"]:
+                score += 0.02
+            elif "test" in result.file_path.name.lower():
+                score -= 0.02  # Slightly penalize test files unless specifically searching for tests
+
+            # Factor 6: Recency bias (prefer shorter file paths - often more core files)
+            path_depth = len(result.file_path.parts)
+            if path_depth <= 3:
+                score += 0.02
+            elif path_depth > 5:
+                score -= 0.01
+
+            # Ensure score doesn't exceed 1.0
+            result.similarity_score = min(1.0, score)
+
+        # Sort by enhanced similarity score
+        results.sort(key=lambda r: r.similarity_score, reverse=True)
+
+        # Update ranks
+        for i, result in enumerate(results):
+            result.rank = i + 1
+
+        return results
+
+    def analyze_query(self, query: str) -> dict[str, Any]:
+        """Analyze search query and provide suggestions for improvement.
+
+        Args:
+            query: Search query to analyze
+
+        Returns:
+            Dictionary with analysis results and suggestions
+        """
+        analysis = {
+            "original_query": query,
+            "processed_query": self._preprocess_query(query),
+            "query_type": "general",
+            "suggestions": [],
+            "confidence": "medium",
+        }
+
+        query_lower = query.lower()
+
+        # Detect query type
+        if any(word in query_lower for word in ["function", "method", "def", "func"]):
+            analysis["query_type"] = "function_search"
+            analysis["suggestions"].append(
+                "Try searching for specific function names or patterns"
+            )
+        elif any(word in query_lower for word in ["class", "object", "type"]):
+            analysis["query_type"] = "class_search"
+            analysis["suggestions"].append(
+                "Include class inheritance or interface information"
+            )
+        elif any(word in query_lower for word in ["error", "exception", "bug", "fix"]):
+            analysis["query_type"] = "error_handling"
+            analysis["suggestions"].append("Include error types or exception names")
+        elif any(word in query_lower for word in ["test", "spec", "mock"]):
+            analysis["query_type"] = "testing"
+            analysis["suggestions"].append("Specify test framework or testing patterns")
+        elif any(word in query_lower for word in ["config", "setting", "option"]):
+            analysis["query_type"] = "configuration"
+            analysis["suggestions"].append(
+                "Include configuration file types or setting names"
+            )
+
+        # Analyze query complexity
+        words = query.split()
+        if len(words) == 1:
+            analysis["confidence"] = "low"
+            analysis["suggestions"].append(
+                "Try adding more descriptive words for better results"
+            )
+        elif len(words) > 10:
+            analysis["confidence"] = "low"
+            analysis["suggestions"].append(
+                "Consider simplifying your query for better matching"
+            )
+        else:
+            analysis["confidence"] = "high"
+
+        # Check for common programming patterns
+        if re.search(r"\b\w+\(\)", query):
+            analysis["suggestions"].append(
+                "Function call detected - searching for function definitions"
+            )
+        if re.search(r"\b[A-Z][a-zA-Z]*\b", query):
+            analysis["suggestions"].append(
+                "CamelCase detected - searching for class or type names"
+            )
+        if re.search(r"\b\w+\.\w+", query):
+            analysis["suggestions"].append(
+                "Dot notation detected - searching for method calls or properties"
+            )
+
+        return analysis
+
+    def suggest_related_queries(
+        self, query: str, results: list[SearchResult]
+    ) -> list[str]:
+        """Suggest related queries based on search results.
+
+        Args:
+            query: Original search query
+            results: Search results
+
+        Returns:
+            List of suggested related queries
+        """
+        suggestions = []
+
+        if not results:
+            # No results - suggest broader queries
+            words = query.lower().split()
+            if len(words) > 1:
+                # Try individual words
+                suggestions.extend(words[:3])  # Top 3 words
+
+            # Suggest common related terms
+            related_terms = {
+                "auth": ["login", "user", "session", "token"],
+                "database": ["query", "model", "schema", "connection"],
+                "api": ["endpoint", "request", "response", "handler"],
+                "test": ["mock", "assert", "spec", "unit"],
+                "error": ["exception", "handle", "catch", "debug"],
+            }
+
+            for word in words:
+                if word in related_terms:
+                    suggestions.extend(related_terms[word][:2])
+        else:
+            # Extract common patterns from results
+            function_names = [r.function_name for r in results if r.function_name]
+            class_names = [r.class_name for r in results if r.class_name]
+
+            # Suggest function names
+            if function_names:
+                unique_functions = list(set(function_names))[:3]
+                suggestions.extend(unique_functions)
+
+            # Suggest class names
+            if class_names:
+                unique_classes = list(set(class_names))[:3]
+                suggestions.extend(unique_classes)
+
+            # Suggest file-based queries
+            file_patterns = set()
+            for result in results[:5]:  # Top 5 results
+                file_name = result.file_path.stem
+                if "_" in file_name:
+                    file_patterns.update(file_name.split("_"))
+                elif file_name not in suggestions:
+                    file_patterns.add(file_name)
+
+            suggestions.extend(list(file_patterns)[:3])
+
+        # Remove duplicates and original query words
+        query_words = set(query.lower().split())
+        unique_suggestions = []
+        for suggestion in suggestions:
+            if (
+                suggestion
+                and suggestion.lower() not in query_words
+                and suggestion not in unique_suggestions
+            ):
+                unique_suggestions.append(suggestion)
+
+        return unique_suggestions[:5]  # Return top 5 suggestions
+
+    async def search_with_context(
+        self,
+        query: str,
+        context_files: list[Path] | None = None,
+        limit: int = 10,
+        similarity_threshold: float | None = None,
+    ) -> dict[str, Any]:
+        """Enhanced search with contextual analysis and suggestions.
+
+        Args:
+            query: Search query
+            context_files: Optional list of files to provide context
+            limit: Maximum number of results
+            similarity_threshold: Minimum similarity score
+
+        Returns:
+            Dictionary with results, analysis, and suggestions
+        """
+        # Analyze the query
+        query_analysis = self.analyze_query(query)
+
+        # Perform the search
+        results = await self.search(
+            query=query,
+            limit=limit,
+            similarity_threshold=similarity_threshold,
+            include_context=True,
+        )
+
+        # Get related query suggestions
+        suggestions = self.suggest_related_queries(query, results)
+
+        # Enhance results with additional context if context files provided
+        if context_files:
+            results = await self._enhance_with_file_context(results, context_files)
+
+        # Calculate result quality metrics
+        quality_metrics = self._calculate_result_quality(results, query)
+
+        return {
+            "query": query,
+            "analysis": query_analysis,
+            "results": results,
+            "suggestions": suggestions,
+            "metrics": quality_metrics,
+            "total_results": len(results),
+        }
+
+    async def _enhance_with_file_context(
+        self, results: list[SearchResult], context_files: list[Path]
+    ) -> list[SearchResult]:
+        """Enhance results by considering context from specific files.
+
+        Args:
+            results: Original search results
+            context_files: Files to use for context
+
+        Returns:
+            Enhanced search results
+        """
+        # Read context from files
+        context_content = []
+        for file_path in context_files:
+            try:
+                with open(file_path, encoding="utf-8") as f:
+                    content = f.read()
+                context_content.append(content)
+            except Exception as e:
+                logger.warning(f"Failed to read context file {file_path}: {e}")
+
+        if not context_content:
+            return results
+
+        # Boost results that are related to context files
+        context_text = " ".join(context_content).lower()
+
         for result in results:
-            # Boost score for exact matches in function/class names
-            boost = 0.0
-
-            if result.function_name and query_lower in result.function_name.lower():
-                boost += 0.1
-
-            if result.class_name and query_lower in result.class_name.lower():
-                boost += 0.1
-
-            # Boost score for matches in file name
-            if query_lower in result.file_path.name.lower():
-                boost += 0.05
-
-            # Apply boost
-            result.similarity_score = min(1.0, result.similarity_score + boost)
-
-        # Re-sort by similarity score
+            # Check if result is from one of the context files
+            if result.file_path in context_files:
+                result.similarity_score = min(1.0, result.similarity_score + 0.1)
+
+            # Check if result content relates to context
+            result.content.lower()
+            if result.function_name:
+                func_name_lower = result.function_name.lower()
+                if func_name_lower in context_text:
+                    result.similarity_score = min(1.0, result.similarity_score + 0.05)
+
+            if result.class_name:
+                class_name_lower = result.class_name.lower()
+                if class_name_lower in context_text:
+                    result.similarity_score = min(1.0, result.similarity_score + 0.05)
+
+        # Re-sort by updated scores
         results.sort(key=lambda r: r.similarity_score, reverse=True)
-
+
         # Update ranks
         for i, result in enumerate(results):
             result.rank = i + 1
-
+
         return results
 
-    def _extract_function_content(self, content: str, function_name: str) -> Optional[str]:
+    def _calculate_result_quality(
+        self, results: list[SearchResult], query: str
+    ) -> dict[str, Any]:
+        """Calculate quality metrics for search results.
+
+        Args:
+            results: Search results
+            query: Original query
+
+        Returns:
+            Dictionary with quality metrics
+        """
+        if not results:
+            return {
+                "average_score": 0.0,
+                "score_distribution": {},
+                "diversity": 0.0,
+                "coverage": 0.0,
+            }
+
+        # Calculate average similarity score
+        scores = [r.similarity_score for r in results]
+        avg_score = sum(scores) / len(scores)
+
+        # Score distribution
+        high_quality = sum(1 for s in scores if s >= 0.8)
+        medium_quality = sum(1 for s in scores if 0.6 <= s < 0.8)
+        low_quality = sum(1 for s in scores if s < 0.6)
+
+        # Diversity (unique files)
+        unique_files = len({r.file_path for r in results})
+        diversity = unique_files / len(results) if results else 0.0
+
+        # Coverage (how many query words are covered)
+        query_words = set(query.lower().split())
+        covered_words = set()
+        for result in results:
+            content_words = set(result.content.lower().split())
+            covered_words.update(query_words.intersection(content_words))
+
+        coverage = len(covered_words) / len(query_words) if query_words else 0.0
+
+        return {
+            "average_score": round(avg_score, 3),
+            "score_distribution": {
+                "high_quality": high_quality,
+                "medium_quality": medium_quality,
+                "low_quality": low_quality,
+            },
+            "diversity": round(diversity, 3),
+            "coverage": round(coverage, 3),
+        }
+
+    def _extract_function_content(self, content: str, function_name: str) -> str | None:
         """Extract content of a specific function from code.
-
+
         Args:
             content: Full file content
             function_name: Name of function to extract
-
+
         Returns:
             Function content if found, None otherwise
         """
         # Simple regex-based extraction (could be improved with AST)
         pattern = rf"^\s*def\s+{re.escape(function_name)}\s*\("
         lines = content.splitlines()
-
+
         for i, line in enumerate(lines):
             if re.match(pattern, line):
                 # Found function start, now find the end
                 start_line = i
                 indent_level = len(line) - len(line.lstrip())
-
+
                 # Find end of function
                 end_line = len(lines)
                 for j in range(i + 1, len(lines)):
@@ -298,27 +786,27 @@ class SemanticSearchEngine:
                     if current_indent <= indent_level:
                         end_line = j
                         break
-
+
                 return "\n".join(lines[start_line:end_line])
-
+
         return None
 
-    async def get_search_stats(self) -> Dict[str, Any]:
+    async def get_search_stats(self) -> dict[str, Any]:
         """Get search engine statistics.
-
+
         Returns:
             Dictionary with search statistics
         """
         try:
             db_stats = await self.database.get_stats()
-
+
             return {
                 "total_chunks": db_stats.total_chunks,
                 "languages": db_stats.languages,
                 "similarity_threshold": self.similarity_threshold,
                 "project_root": str(self.project_root),
            }
-
+
         except Exception as e:
             logger.error(f"Failed to get search stats: {e}")
             return {"error": str(e)}
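
Taken together, search_with_context() bundles everything added in this release: query analysis, adaptive-threshold search, related-query suggestions, optional context-file boosting, and quality metrics, all in one awaitable call. A closing sketch reusing the engine from the earlier examples; the context-file path is a hypothetical placeholder:

```python
from pathlib import Path


async def demo_report(engine) -> None:
    report = await engine.search_with_context(
        "database connection pool",
        context_files=[Path("src/db/pool.py")],  # hypothetical path
        limit=10,
    )
    print(report["analysis"]["query_type"])  # "general" for this query
    print(report["metrics"])                 # average_score, score_distribution, diversity, coverage
    print(report["suggestions"])             # up to 5 related queries
    for result in report["results"][:3]:
        print(result.rank, result.file_path)
```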