stravinsky-0.2.67-py3-none-any.whl → stravinsky-0.4.18-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of stravinsky might be problematic.

@@ -0,0 +1,3042 @@
+ """
+ Semantic Code Search - Vector-based code understanding
+
+ Uses ChromaDB for persistent vector storage with multiple embedding providers:
+ - Ollama (local, free) - nomic-embed-text (768 dims)
+ - Mxbai (local, free) - mxbai-embed-large (1024 dims, better for code)
+ - Gemini (cloud, OAuth) - gemini-embedding-001 (768-3072 dims)
+ - OpenAI (cloud, OAuth) - text-embedding-3-small (1536 dims)
+ - HuggingFace (cloud, token) - sentence-transformers/all-mpnet-base-v2 (768 dims)
+
+ Enables natural language queries like "find authentication logic" without
+ requiring exact pattern matching.
+
+ Architecture:
+ - Per-project ChromaDB storage at ~/.stravinsky/vectordb/<project_hash>/
+ - Lazy initialization on first query
+ - Provider abstraction for embedding generation
+ - Chunking strategy: function/class level with context
+ """
+
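
For orientation, a minimal usage sketch of the module-level API this file defines. The import path mcp_bridge.tools.semantic_search is an assumption inferred from the file's own mcp_bridge.tools.* imports; semantic_search and its parameters appear verbatim in the source below, and the query string is illustrative.

    import asyncio

    # Assumed module path; adjust to wherever this file lands in the package.
    from mcp_bridge.tools.semantic_search import semantic_search

    report = asyncio.run(
        semantic_search(
            "find authentication logic",
            project_path=".",
            n_results=5,
            provider="ollama",  # local provider; requires `ollama pull nomic-embed-text`
        )
    )
    print(report)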
+ import atexit
+ import hashlib
+ import logging
+ import sys
+ import threading
+ from abc import ABC, abstractmethod
+ from pathlib import Path
+ from typing import Literal
+
+ import httpx
+ from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
+
+ from mcp_bridge.auth.token_store import TokenStore
+ from mcp_bridge.tools.query_classifier import QueryCategory, classify_query
+
+ logger = logging.getLogger(__name__)
+
+
+ # Lazy imports for watchdog (avoid startup cost)
+ _watchdog = None
+ _watchdog_import_lock = threading.Lock()
+
+
+ def get_watchdog():
+     """Lazy import of watchdog components for file watching."""
+     global _watchdog
+     if _watchdog is None:
+         with _watchdog_import_lock:
+             if _watchdog is None:
+                 from watchdog.events import FileSystemEventHandler
+                 from watchdog.observers import Observer
+
+                 _watchdog = {"Observer": Observer, "FileSystemEventHandler": FileSystemEventHandler}
+     return _watchdog
+
+
+ # Embedding provider type
+ EmbeddingProvider = Literal["ollama", "mxbai", "gemini", "openai", "huggingface"]
+
+ # Lazy imports to avoid startup cost
+ _chromadb = None
+ _ollama = None
+ _httpx = None
+ _filelock = None
+ _import_lock = threading.Lock()
+
+
+ def get_filelock():
+     global _filelock
+     if _filelock is None:
+         with _import_lock:
+             if _filelock is None:
+                 import filelock
+
+                 _filelock = filelock
+     return _filelock
+
+
+ def get_chromadb():
+     global _chromadb
+     if _chromadb is None:
+         with _import_lock:
+             if _chromadb is None:
+                 try:
+                     import chromadb
+
+                     _chromadb = chromadb
+                 except ImportError as e:
+                     import sys
+
+                     if sys.version_info >= (3, 14):
+                         raise ImportError(
+                             "ChromaDB is not available on Python 3.14+. "
+                             "Semantic search is not supported on Python 3.14 yet. "
+                             "Use Python 3.11-3.13 for semantic search features."
+                         ) from e
+                     raise
+     return _chromadb
+
+
+ def get_ollama():
+     global _ollama
+     if _ollama is None:
+         with _import_lock:
+             if _ollama is None:
+                 import ollama
+
+                 _ollama = ollama
+     return _ollama
+
+
+ def get_httpx():
+     global _httpx
+     if _httpx is None:
+         with _import_lock:
+             if _httpx is None:
+                 import httpx
+
+                 _httpx = httpx
+     return _httpx
+
+
+ # ========================
+ # GITIGNORE MANAGER
+ # ========================
+
+ # Lazy import for pathspec
+ _pathspec = None
+ _pathspec_lock = threading.Lock()
+
+
+ def get_pathspec():
+     """Lazy import of pathspec for gitignore pattern matching."""
+     global _pathspec
+     if _pathspec is None:
+         with _pathspec_lock:
+             if _pathspec is None:
+                 import pathspec
+
+                 _pathspec = pathspec
+     return _pathspec
+
+
+ class GitIgnoreManager:
+     """Manages .gitignore and .stravignore pattern matching.
+
+     Loads and caches gitignore-style patterns from:
+     - .gitignore (standard git ignore patterns)
+     - .stravignore (Stravinsky-specific ignore patterns)
+
+     Patterns are combined and cached per project for efficient matching.
+     The manager automatically reloads patterns if the ignore files are modified.
+     """
+
+     # Cache of GitIgnoreManager instances per project path
+     _instances: dict[str, "GitIgnoreManager"] = {}
+     _instances_lock = threading.Lock()
+
+     @classmethod
+     def get_instance(cls, project_path: Path) -> "GitIgnoreManager":
+         """Get or create a GitIgnoreManager for a project.
+
+         Args:
+             project_path: Root path of the project
+
+         Returns:
+             Cached GitIgnoreManager instance for the project
+         """
+         path_str = str(project_path.resolve())
+         if path_str not in cls._instances:
+             with cls._instances_lock:
+                 if path_str not in cls._instances:
+                     cls._instances[path_str] = cls(project_path)
+         return cls._instances[path_str]
+
+     @classmethod
+     def clear_cache(cls, project_path: Path | None = None) -> None:
+         """Clear cached GitIgnoreManager instances.
+
+         Args:
+             project_path: Clear only this project's cache, or all if None
+         """
+         with cls._instances_lock:
+             if project_path is None:
+                 cls._instances.clear()
+             else:
+                 path_str = str(project_path.resolve())
+                 cls._instances.pop(path_str, None)
+
+     def __init__(self, project_path: Path):
+         """Initialize the GitIgnoreManager.
+
+         Args:
+             project_path: Root path of the project
+         """
+         self.project_path = project_path.resolve()
+         self._spec = None
+         self._gitignore_mtime: float | None = None
+         self._stravignore_mtime: float | None = None
+         self._lock = threading.Lock()
+
+     def _get_file_mtime(self, file_path: Path) -> float | None:
+         """Get modification time of a file, or None if it doesn't exist."""
+         try:
+             return file_path.stat().st_mtime
+         except (OSError, FileNotFoundError):
+             return None
+
+     def _needs_reload(self) -> bool:
+         """Check if ignore patterns need to be reloaded."""
+         gitignore_path = self.project_path / ".gitignore"
+         stravignore_path = self.project_path / ".stravignore"
+
+         current_gitignore_mtime = self._get_file_mtime(gitignore_path)
+         current_stravignore_mtime = self._get_file_mtime(stravignore_path)
+
+         # Check if either file has been modified or if we haven't loaded yet
+         if self._spec is None:
+             return True
+
+         if current_gitignore_mtime != self._gitignore_mtime:
+             return True
+
+         if current_stravignore_mtime != self._stravignore_mtime:
+             return True
+
+         return False
+
+     def _load_patterns(self) -> None:
+         """Load patterns from .gitignore and .stravignore files."""
+         pathspec = get_pathspec()
+
+         patterns = []
+         gitignore_path = self.project_path / ".gitignore"
+         stravignore_path = self.project_path / ".stravignore"
+
+         # Load .gitignore patterns
+         if gitignore_path.exists():
+             try:
+                 with open(gitignore_path, encoding="utf-8") as f:
+                     patterns.extend(f.read().splitlines())
+                 self._gitignore_mtime = self._get_file_mtime(gitignore_path)
+                 logger.debug(f"Loaded .gitignore from {gitignore_path}")
+             except Exception as e:
+                 logger.warning(f"Failed to load .gitignore: {e}")
+                 self._gitignore_mtime = None
+         else:
+             self._gitignore_mtime = None
+
+         # Load .stravignore patterns
+         if stravignore_path.exists():
+             try:
+                 with open(stravignore_path, encoding="utf-8") as f:
+                     patterns.extend(f.read().splitlines())
+                 self._stravignore_mtime = self._get_file_mtime(stravignore_path)
+                 logger.debug(f"Loaded .stravignore from {stravignore_path}")
+             except Exception as e:
+                 logger.warning(f"Failed to load .stravignore: {e}")
+                 self._stravignore_mtime = None
+         else:
+             self._stravignore_mtime = None
+
+         # Filter out empty lines and comments
+         patterns = [p for p in patterns if p.strip() and not p.strip().startswith("#")]
+
+         # Create pathspec matcher
+         self._spec = pathspec.PathSpec.from_lines("gitwildmatch", patterns)
+         logger.debug(f"Loaded {len(patterns)} ignore patterns for {self.project_path}")
+
+     @property
+     def spec(self):
+         """Get the PathSpec matcher, reloading if necessary."""
+         with self._lock:
+             if self._needs_reload():
+                 self._load_patterns()
+             return self._spec
+
+     def is_ignored(self, file_path: Path) -> bool:
+         """Check if a file path should be ignored.
+
+         Args:
+             file_path: Absolute or relative path to check
+
+         Returns:
+             True if the file matches any ignore pattern, False otherwise
+         """
+         try:
+             # Convert to relative path from project root
+             if file_path.is_absolute():
+                 rel_path = file_path.resolve().relative_to(self.project_path)
+             else:
+                 rel_path = file_path
+
+             # pathspec expects forward slashes and string paths
+             rel_path_str = str(rel_path).replace("\\", "/")
+
+             # Check against patterns
+             return self.spec.match_file(rel_path_str)
+         except ValueError:
+             # Path is outside project - not ignored by gitignore (but may be ignored for other reasons)
+             return False
+         except Exception as e:
+             logger.warning(f"Error checking ignore status for {file_path}: {e}")
+             return False
+
+     def filter_paths(self, paths: list[Path]) -> list[Path]:
+         """Filter a list of paths, removing ignored ones.
+
+         Args:
+             paths: List of paths to filter
+
+         Returns:
+             List of paths that are not ignored
+         """
+         return [p for p in paths if not self.is_ignored(p)]
+
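
A short sketch of how the cached manager above is meant to be used (class and method names are from the source; the project path and file names are illustrative):

    from pathlib import Path

    manager = GitIgnoreManager.get_instance(Path("/path/to/project"))
    if not manager.is_ignored(Path("src/app.py")):
        pass  # file passed .gitignore/.stravignore filtering and can be indexed
    to_index = manager.filter_paths([Path("src/app.py"), Path("dist/bundle.js")])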
+
+ # ========================
+ # EMBEDDING PROVIDERS
+ # ========================
+
+
+ class BaseEmbeddingProvider(ABC):
+     """Abstract base class for embedding providers."""
+
+     @abstractmethod
+     async def get_embedding(self, text: str) -> list[float]:
+         """Get embedding vector for text."""
+         pass
+
+     @abstractmethod
+     async def check_available(self) -> bool:
+         """Check if the provider is available and ready."""
+         pass
+
+     @property
+     @abstractmethod
+     def dimension(self) -> int:
+         """Return the embedding dimension for this provider."""
+         pass
+
+     @property
+     @abstractmethod
+     def name(self) -> str:
+         """Return the provider name."""
+         pass
+
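
The ABC above is the whole provider contract: two async methods plus two properties. A hypothetical minimal subclass (not part of the package, useful only as a wiring test) would look like:

    class ZeroProvider(BaseEmbeddingProvider):
        """Hypothetical provider that returns zero vectors."""

        @property
        def dimension(self) -> int:
            return 8

        @property
        def name(self) -> str:
            return "zero"

        async def check_available(self) -> bool:
            return True

        async def get_embedding(self, text: str) -> list[float]:
            return [0.0] * self.dimension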
+
+ class OllamaProvider(BaseEmbeddingProvider):
+     """Ollama local embedding provider using nomic-embed-text."""
+
+     MODEL = "nomic-embed-text"
+     DIMENSION = 768
+
+     def __init__(self):
+         self._available: bool | None = None
+
+     @property
+     def dimension(self) -> int:
+         return self.DIMENSION
+
+     @property
+     def name(self) -> str:
+         return "ollama"
+
+     async def check_available(self) -> bool:
+         if self._available is not None:
+             return self._available
+
+         try:
+             ollama = get_ollama()
+             models = ollama.list()
+             model_names = [m.model for m in models.models] if hasattr(models, "models") else []
+
+             if not any(name and self.MODEL in name for name in model_names):
+                 print(
+                     f"⚠️ Embedding model '{self.MODEL}' not found. Run: ollama pull {self.MODEL}",
+                     file=sys.stderr,
+                 )
+                 self._available = False
+                 return False
+
+             self._available = True
+             return True
+         except Exception as e:
+             print(f"⚠️ Ollama not available: {e}. Start with: ollama serve", file=sys.stderr)
+             self._available = False
+             return False
+
+     async def get_embedding(self, text: str) -> list[float]:
+         ollama = get_ollama()
+         # nomic-embed-text has 8192 token context. Code can be 1-2 chars/token.
+         # Truncate to 2000 chars (~1000-2000 tokens) for larger safety margin
+         truncated = text[:2000] if len(text) > 2000 else text
+         response = ollama.embeddings(model=self.MODEL, prompt=truncated)
+         return response["embedding"]
+
+
+ class GeminiProvider(BaseEmbeddingProvider):
+     """Gemini embedding provider using OAuth authentication."""
+
+     MODEL = "gemini-embedding-001"
+     DIMENSION = 768  # Using 768 for efficiency, can be up to 3072
+
+     def __init__(self):
+         self._available: bool | None = None
+         self._token_store = None
+
+     def _get_token_store(self):
+         if self._token_store is None:
+             from ..auth.token_store import TokenStore
+
+             self._token_store = TokenStore()
+         return self._token_store
+
+     @property
+     def dimension(self) -> int:
+         return self.DIMENSION
+
+     @property
+     def name(self) -> str:
+         return "gemini"
+
+     async def check_available(self) -> bool:
+         if self._available is not None:
+             return self._available
+
+         try:
+             token_store = self._get_token_store()
+             access_token = token_store.get_access_token("gemini")
+
+             if not access_token:
+                 print(
+                     "⚠️ Gemini not authenticated. Run: stravinsky-auth login gemini",
+                     file=sys.stderr,
+                 )
+                 self._available = False
+                 return False
+
+             self._available = True
+             return True
+         except Exception as e:
+             print(f"⚠️ Gemini not available: {e}", file=sys.stderr)
+             self._available = False
+             return False
+
+     async def get_embedding(self, text: str) -> list[float]:
+         import os
+
+         from ..auth.oauth import (
+             ANTIGRAVITY_DEFAULT_PROJECT_ID,
+             ANTIGRAVITY_ENDPOINTS,
+             ANTIGRAVITY_HEADERS,
+         )
+
+         token_store = self._get_token_store()
+         access_token = token_store.get_access_token("gemini")
+
+         if not access_token:
+             raise ValueError("Not authenticated with Gemini. Run: stravinsky-auth login gemini")
+
+         httpx = get_httpx()
+
+         # Use Antigravity endpoint for embeddings (same auth as invoke_gemini)
+         project_id = os.getenv("STRAVINSKY_ANTIGRAVITY_PROJECT_ID", ANTIGRAVITY_DEFAULT_PROJECT_ID)
+
+         headers = {
+             "Authorization": f"Bearer {access_token}",
+             "Content-Type": "application/json",
+             **ANTIGRAVITY_HEADERS,
+         }
+
+         # Wrap request for Antigravity API
+         import uuid
+
+         inner_payload = {
+             "model": f"models/{self.MODEL}",
+             "content": {"parts": [{"text": text}]},
+             "outputDimensionality": self.DIMENSION,
+         }
+
+         wrapped_payload = {
+             "project": project_id,
+             "model": self.MODEL,
+             "userAgent": "antigravity",
+             "requestId": f"embed-{uuid.uuid4()}",
+             "request": inner_payload,
+         }
+
+         # Try endpoints in order
+         last_error = None
+         async with httpx.AsyncClient(timeout=60.0) as client:
+             for endpoint in ANTIGRAVITY_ENDPOINTS:
+                 api_url = f"{endpoint}/v1internal:embedContent"
+
+                 try:
+                     response = await client.post(
+                         api_url,
+                         headers=headers,
+                         json=wrapped_payload,
+                     )
+
+                     if response.status_code in (401, 403):
+                         last_error = Exception(f"{response.status_code} from {endpoint}")
+                         continue
+
+                     response.raise_for_status()
+                     data = response.json()
+
+                     # Extract embedding from response
+                     inner_response = data.get("response", data)
+                     embedding = inner_response.get("embedding", {})
+                     values = embedding.get("values", [])
+
+                     if values:
+                         return values
+
+                     raise ValueError(f"No embedding values in response: {data}")
+
+                 except Exception as e:
+                     last_error = e
+                     continue
+
+         raise ValueError(f"All Antigravity endpoints failed for embeddings: {last_error}")
+
+
+ class OpenAIProvider(BaseEmbeddingProvider):
+     """OpenAI embedding provider using OAuth authentication."""
+
+     MODEL = "text-embedding-3-small"
+     DIMENSION = 1536
+
+     def __init__(self):
+         self._available: bool | None = None
+         self._token_store = None
+
+     def _get_token_store(self):
+         if self._token_store is None:
+             from ..auth.token_store import TokenStore
+
+             self._token_store = TokenStore()
+         return self._token_store
+
+     @property
+     def dimension(self) -> int:
+         return self.DIMENSION
+
+     @property
+     def name(self) -> str:
+         return "openai"
+
+     async def check_available(self) -> bool:
+         if self._available is not None:
+             return self._available
+
+         try:
+             token_store = self._get_token_store()
+             access_token = token_store.get_access_token("openai")
+
+             if not access_token:
+                 print(
+                     "⚠️ OpenAI not authenticated. Run: stravinsky-auth login openai",
+                     file=sys.stderr,
+                 )
+                 self._available = False
+                 return False
+
+             self._available = True
+             return True
+         except Exception as e:
+             print(f"⚠️ OpenAI not available: {e}", file=sys.stderr)
+             self._available = False
+             return False
+
+     async def get_embedding(self, text: str) -> list[float]:
+         token_store = self._get_token_store()
+         access_token = token_store.get_access_token("openai")
+
+         if not access_token:
+             raise ValueError("Not authenticated with OpenAI. Run: stravinsky-auth login openai")
+
+         httpx = get_httpx()
+
+         # Use standard OpenAI API for embeddings
+         api_url = "https://api.openai.com/v1/embeddings"
+
+         headers = {
+             "Authorization": f"Bearer {access_token}",
+             "Content-Type": "application/json",
+         }
+
+         payload = {
+             "model": self.MODEL,
+             "input": text,
+         }
+
+         async with httpx.AsyncClient(timeout=60.0) as client:
+             response = await client.post(api_url, headers=headers, json=payload)
+
+             if response.status_code == 401:
+                 raise ValueError("OpenAI authentication failed. Run: stravinsky-auth login openai")
+
+             response.raise_for_status()
+             data = response.json()
+
+             # Extract embedding from response
+             embeddings = data.get("data", [])
+             if embeddings and "embedding" in embeddings[0]:
+                 return embeddings[0]["embedding"]
+
+             raise ValueError(f"No embedding in response: {data}")
+
+
+ class MxbaiProvider(BaseEmbeddingProvider):
+     """Ollama local embedding provider using mxbai-embed-large (better for code).
+
+     mxbai-embed-large is a 1024-dimensional model optimized for code understanding.
+     It generally outperforms nomic-embed-text on code-related retrieval tasks.
+     """
+
+     MODEL = "mxbai-embed-large"
+     DIMENSION = 1024
+
+     def __init__(self):
+         self._available: bool | None = None
+
+     @property
+     def dimension(self) -> int:
+         return self.DIMENSION
+
+     @property
+     def name(self) -> str:
+         return "mxbai"
+
+     async def check_available(self) -> bool:
+         if self._available is not None:
+             return self._available
+
+         try:
+             ollama = get_ollama()
+             models = ollama.list()
+             model_names = [m.model for m in models.models] if hasattr(models, "models") else []
+
+             if not any(name and self.MODEL in name for name in model_names):
+                 print(
+                     f"⚠️ Embedding model '{self.MODEL}' not found. Run: ollama pull {self.MODEL}",
+                     file=sys.stderr,
+                 )
+                 self._available = False
+                 return False
+
+             self._available = True
+             return True
+         except Exception as e:
+             print(f"⚠️ Ollama not available: {e}. Start with: ollama serve", file=sys.stderr)
+             self._available = False
+             return False
+
+     async def get_embedding(self, text: str) -> list[float]:
+         ollama = get_ollama()
+         # mxbai-embed-large has a 512-token context window, which is roughly
+         # 2000 chars of code at ~4 chars/token, so truncate to 2000 chars to fit.
+         truncated = text[:2000] if len(text) > 2000 else text
+         response = ollama.embeddings(model=self.MODEL, prompt=truncated)
+         return response["embedding"]
+
+
+ class HuggingFaceProvider(BaseEmbeddingProvider):
+     """Hugging Face Inference API embedding provider.
+
+     Uses the Hugging Face Inference API for embeddings. Requires HF_TOKEN from:
+     1. Environment variable: HF_TOKEN or HUGGING_FACE_HUB_TOKEN
+     2. HF CLI config: ~/.cache/huggingface/token or ~/.huggingface/token
+
+     Default model: sentence-transformers/all-mpnet-base-v2 (768 dims, high quality)
+     """
+
+     DEFAULT_MODEL = "sentence-transformers/all-mpnet-base-v2"
+     DEFAULT_DIMENSION = 768
+
+     def __init__(self, model: str | None = None):
+         self._available: bool | None = None
+         self._model = model or self.DEFAULT_MODEL
+         # Dimension varies by model, but we'll use default for common models
+         self._dimension = self.DEFAULT_DIMENSION
+         self._token: str | None = None
+
+     @property
+     def dimension(self) -> int:
+         return self._dimension
+
+     @property
+     def name(self) -> str:
+         return "huggingface"
+
+     def _get_hf_token(self) -> str | None:
+         """Discover HF token from environment or CLI config."""
+         import os
+
+         # Check environment variables first
+         token = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN")
+         if token:
+             return token
+
+         # Check HF CLI config locations
+         hf_token_paths = [
+             Path.home() / ".cache" / "huggingface" / "token",
+             Path.home() / ".huggingface" / "token",
+         ]
+
+         for token_path in hf_token_paths:
+             if token_path.exists():
+                 try:
+                     return token_path.read_text().strip()
+                 except Exception:
+                     continue
+
+         return None
+
+     async def check_available(self) -> bool:
+         if self._available is not None:
+             return self._available
+
+         try:
+             self._token = self._get_hf_token()
+             if not self._token:
+                 print(
+                     "⚠️ Hugging Face token not found. Run: huggingface-cli login or set HF_TOKEN env var",
+                     file=sys.stderr,
+                 )
+                 self._available = False
+                 return False
+
+             self._available = True
+             return True
+         except Exception as e:
+             print(f"⚠️ Hugging Face not available: {e}", file=sys.stderr)
+             self._available = False
+             return False
+
+     @retry(
+         stop=stop_after_attempt(3),
+         wait=wait_exponential(multiplier=1, min=2, max=10),
+         retry=retry_if_exception_type(httpx.HTTPStatusError),
+     )
+     async def get_embedding(self, text: str) -> list[float]:
+         """Get embedding from HF Inference API with retry logic."""
+         if not self._token:
+             self._token = self._get_hf_token()
+         if not self._token:
+             raise ValueError(
+                 "Hugging Face token not found. Run: huggingface-cli login or set HF_TOKEN"
+             )
+
+         httpx_client = get_httpx()
+
+         # HF Serverless Inference API endpoint
+         # Note: Free tier may have limited availability for some models
+         api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{self._model}"
+
+         headers = {
+             "Authorization": f"Bearer {self._token}",
+         }
+
+         # Truncate text to reasonable length (most models have 512 token limit)
+         # ~2000 chars ≈ 500 tokens for safety
+         truncated = text[:2000] if len(text) > 2000 else text
+
+         # HF Inference API accepts raw JSON with inputs field
+         payload = {"inputs": [truncated], "options": {"wait_for_model": True}}
+
+         async with httpx_client.AsyncClient(timeout=60.0) as client:
+             response = await client.post(api_url, headers=headers, json=payload)
+
+             # Handle specific error codes
+             if response.status_code == 401:
+                 raise ValueError(
+                     "Hugging Face authentication failed. Run: huggingface-cli login or set HF_TOKEN"
+                 )
+             elif response.status_code == 410:
+                 # Model removed from free tier
+                 raise ValueError(
+                     f"Model {self._model} is no longer available on HF free Inference API (410 Gone). "
+                     "Try a different model or use Ollama for local embeddings instead."
+                 )
+             elif response.status_code == 503:
+                 # Model loading - retry will handle this
+                 logger.info(f"Model {self._model} is loading, retrying...")
+                 response.raise_for_status()
+             elif response.status_code == 429:
+                 # Rate limit - retry will handle with exponential backoff
+                 logger.warning("HF API rate limit hit, retrying with backoff...")
+                 response.raise_for_status()
+
+             response.raise_for_status()
+
+             # Response is a single embedding vector (list of floats)
+             embedding = response.json()
+
+             # Handle different response formats
+             if isinstance(embedding, list):
+                 # Direct embedding or batch with single item
+                 if isinstance(embedding[0], (int, float)):
+                     return embedding
+                 elif isinstance(embedding[0], list):
+                     # Batch response with single embedding
+                     return embedding[0]
+
+             raise ValueError(f"Unexpected response format from HF API: {type(embedding)}")
+
+     async def embed_batch(self, texts: list[str]) -> list[list[float]]:
+         """Batch embedding support for HF API.
+
+         HF API supports batch requests, so we can send multiple texts at once.
+         """
+         if not texts:
+             return []
+
+         if not self._token:
+             self._token = self._get_hf_token()
+         if not self._token:
+             raise ValueError(
+                 "Hugging Face token not found. Run: huggingface-cli login or set HF_TOKEN"
+             )
+
+         httpx_client = get_httpx()
+
+         # HF Serverless Inference API endpoint
+         api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{self._model}"
+
+         headers = {
+             "Authorization": f"Bearer {self._token}",
+         }
+
+         # Truncate all texts
+         truncated_texts = [text[:2000] if len(text) > 2000 else text for text in texts]
+
+         payload = {"inputs": truncated_texts, "options": {"wait_for_model": True}}
+
+         async with httpx_client.AsyncClient(timeout=120.0) as client:
+             response = await client.post(api_url, headers=headers, json=payload)
+
+             if response.status_code == 401:
+                 raise ValueError(
+                     "Hugging Face authentication failed. Run: huggingface-cli login or set HF_TOKEN"
+                 )
+
+             response.raise_for_status()
+
+             embeddings = response.json()
+
+             # Response should be a list of embeddings
+             if isinstance(embeddings, list) and all(isinstance(e, list) for e in embeddings):
+                 return embeddings
+
+             raise ValueError(f"Unexpected batch response format from HF API: {type(embeddings)}")
+
+
+ # Embedding provider instance cache
+ _embedding_provider_cache: dict[str, BaseEmbeddingProvider] = {}
+ _embedding_provider_lock = threading.Lock()
+
+
+ def get_embedding_provider(provider: EmbeddingProvider) -> BaseEmbeddingProvider:
+     """Factory function to get an embedding provider instance with caching."""
+     if provider not in _embedding_provider_cache:
+         with _embedding_provider_lock:
+             # Double-check pattern to avoid race condition
+             if provider not in _embedding_provider_cache:
+                 providers = {
+                     "ollama": OllamaProvider,
+                     "mxbai": MxbaiProvider,
+                     "gemini": GeminiProvider,
+                     "openai": OpenAIProvider,
+                     "huggingface": HuggingFaceProvider,
+                 }
+
+                 if provider not in providers:
+                     raise ValueError(
+                         f"Unknown provider: {provider}. Available: {list(providers.keys())}"
+                     )
+
+                 _embedding_provider_cache[provider] = providers[provider]()
+
+     return _embedding_provider_cache[provider]
+
+
+ class CodebaseVectorStore:
+     """
+     Persistent vector store for a single codebase.
+
+     Storage: ~/.stravinsky/vectordb/<project_hash>_<provider>/
+     Embedding: Configurable via provider (ollama, gemini, openai)
+     """
+
+     CHUNK_SIZE = 50  # lines per chunk
+     CHUNK_OVERLAP = 10  # lines of overlap between chunks
+
+     # File patterns to index
+     CODE_EXTENSIONS = {
+         ".py",
+         ".js",
+         ".ts",
+         ".tsx",
+         ".jsx",
+         ".go",
+         ".rs",
+         ".rb",
+         ".java",
+         ".c",
+         ".cpp",
+         ".h",
+         ".hpp",
+         ".cs",
+         ".swift",
+         ".kt",
+         ".scala",
+         ".vue",
+         ".svelte",
+         ".md",
+         ".txt",
+         ".yaml",
+         ".yml",
+         ".json",
+         ".toml",
+     }
+
+     # Directories to skip
+     SKIP_DIRS = {
+         "node_modules",
+         ".git",
+         "__pycache__",
+         ".venv",
+         "venv",
+         "env",
+         "dist",
+         "build",
+         ".next",
+         ".nuxt",
+         "target",
+         ".tox",
+         ".pytest_cache",
+         ".mypy_cache",
+         ".ruff_cache",
+         "coverage",
+         ".stravinsky",
+         "scratches",
+         "consoles",
+         ".idea",
+     }
+
+     def __init__(self, project_path: str, provider: EmbeddingProvider = "ollama"):
+         self.project_path = Path(project_path).resolve()
+         self.project_hash = hashlib.md5(str(self.project_path).encode()).hexdigest()[:12]
+
+         # Initialize embedding provider
+         self.provider_name = provider
+         self.provider = get_embedding_provider(provider)
+
+         # Store in user's home directory, separate by provider to avoid dimension mismatch
+         self.db_path = Path.home() / ".stravinsky" / "vectordb" / f"{self.project_hash}_{provider}"
+         self.db_path.mkdir(parents=True, exist_ok=True)
+
+         # File lock for single-process access to ChromaDB (prevents corruption)
+         self._lock_path = self.db_path / ".chromadb.lock"
+         self._file_lock = None
+
+         self._client = None
+         self._collection = None
+
+         # File watcher attributes
+         self._watcher: CodebaseFileWatcher | None = None
+         self._watcher_lock = threading.Lock()
+
+         # Cancellation flag for indexing operations
+         self._cancel_indexing = False
+         self._cancel_lock = threading.Lock()
+
+     @property
+     def file_lock(self):
+         """Get or create the file lock for this database.
+
+         Uses filelock to ensure single-process access to ChromaDB,
+         preventing database corruption from concurrent writes.
+         """
+         if self._file_lock is None:
+             filelock = get_filelock()
+             # Timeout of 30 seconds - if lock can't be acquired, raise error
+             self._file_lock = filelock.FileLock(str(self._lock_path), timeout=30)
+         return self._file_lock
+
+     @property
+     def client(self):
+         if self._client is None:
+             chromadb = get_chromadb()
+
+             # Check for stale lock before attempting acquisition
+             # Prevents 30s timeout from dead processes causing MCP "Connection closed" errors
+             if self._lock_path.exists():
+                 import time
+
+                 lock_age = time.time() - self._lock_path.stat().st_mtime
+                 # Lock older than 60 seconds is likely from a crashed process
+                 # (Reduced from 300s to catch recently crashed processes)
+                 if lock_age > 60:
+                     logger.warning(
+                         f"Removing stale ChromaDB lock (age: {lock_age:.0f}s, path: {self._lock_path})"
+                     )
+                     try:
+                         self._lock_path.unlink(missing_ok=True)
+                     except Exception as e:
+                         logger.warning(f"Could not remove stale lock: {e}")
+
+             # Acquire lock before creating client to prevent concurrent access
+             try:
+                 with self.file_lock:  # Auto-releases on exit
+                     logger.debug(f"Acquired ChromaDB lock for {self.db_path}")
+                     self._client = chromadb.PersistentClient(path=str(self.db_path))
+             except Exception as e:
+                 logger.warning(f"Could not acquire ChromaDB lock: {e}. Proceeding without lock.")
+                 self._client = chromadb.PersistentClient(path=str(self.db_path))
+         return self._client
+
+     @property
+     def collection(self):
+         if self._collection is None:
+             self._collection = self.client.get_or_create_collection(
+                 name="codebase", metadata={"hnsw:space": "cosine"}
+             )
+         return self._collection
+
+     async def check_embedding_service(self) -> bool:
+         """Check if the embedding provider is available."""
+         return await self.provider.check_available()
+
+     async def get_embedding(self, text: str) -> list[float]:
+         """Get embedding vector for text using the configured provider."""
+         return await self.provider.get_embedding(text)
+
+     async def get_embeddings_batch(
+         self, texts: list[str], max_concurrent: int = 10
+     ) -> list[list[float]]:
+         """Get embeddings for multiple texts with parallel execution.
+
+         Uses asyncio.gather with semaphore-based concurrency control to avoid
+         overwhelming the embedding service while maximizing throughput.
+
+         Args:
+             texts: List of text strings to embed
+             max_concurrent: Maximum concurrent embedding requests (default: 10)
+
+         Returns:
+             List of embedding vectors in the same order as input texts.
+         """
+         import asyncio
+
+         if not texts:
+             return []
+
+         # Use semaphore to limit concurrent requests
+         semaphore = asyncio.Semaphore(max_concurrent)
+
+         async def get_with_semaphore(text: str, index: int) -> tuple[int, list[float]]:
+             async with semaphore:
+                 emb = await self.get_embedding(text)
+                 return (index, emb)
+
+         # Launch all embedding requests concurrently (respecting semaphore)
+         tasks = [get_with_semaphore(text, i) for i, text in enumerate(texts)]
+         results = await asyncio.gather(*tasks)
+
+         # Sort by original index to maintain order
+         sorted_results = sorted(results, key=lambda x: x[0])
+         return [emb for _, emb in sorted_results]
+
+     def _chunk_file(self, file_path: Path) -> list[dict]:
+         """Split a file into chunks with metadata.
+
+         Uses AST-aware chunking for Python files to respect function/class
+         boundaries. Falls back to line-based chunking for other languages.
+         """
+         try:
+             content = file_path.read_text(encoding="utf-8", errors="ignore")
+         except Exception:
+             return []
+
+         lines = content.split("\n")
+         if len(lines) < 5:  # Skip very small files
+             return []
+
+         rel_path = str(file_path.resolve().relative_to(self.project_path.resolve()))
+         language = file_path.suffix.lstrip(".")
+
+         # Use AST-aware chunking for Python files
+         if language == "py":
+             chunks = self._chunk_python_ast(content, rel_path, language)
+             if chunks:  # If AST parsing succeeded
+                 return chunks
+
+         # Fallback: line-based chunking for other languages or if AST fails
+         return self._chunk_by_lines(lines, rel_path, language)
+
+     def _chunk_python_ast(self, content: str, rel_path: str, language: str) -> list[dict]:
+         """Parse Python file and create chunks based on function/class boundaries.
+
+         Each function, method, and class becomes its own chunk, preserving
+         semantic boundaries for better embedding quality.
+         """
+         import ast
+
+         try:
+             tree = ast.parse(content)
+         except SyntaxError:
+             return []  # Fall back to line-based chunking
+
+         lines = content.split("\n")
+         chunks = []
+
+         def get_docstring(node: ast.AST) -> str:
+             """Extract docstring from a node if present."""
+             if (
+                 isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef))
+                 and node.body
+             ):
+                 first = node.body[0]
+                 if isinstance(first, ast.Expr) and isinstance(first.value, ast.Constant):
+                     if isinstance(first.value.value, str):
+                         return first.value.value
+             return ""
+
+         def get_decorators(
+             node: ast.FunctionDef | ast.AsyncFunctionDef | ast.ClassDef,
+         ) -> list[str]:
+             """Extract decorator names from a node."""
+             decorators = []
+             for dec in node.decorator_list:
+                 if isinstance(dec, ast.Name):
+                     decorators.append(f"@{dec.id}")
+                 elif isinstance(dec, ast.Attribute):
+                     decorators.append(f"@{ast.unparse(dec)}")
+                 elif isinstance(dec, ast.Call):
+                     if isinstance(dec.func, ast.Name):
+                         decorators.append(f"@{dec.func.id}")
+                     elif isinstance(dec.func, ast.Attribute):
+                         decorators.append(f"@{ast.unparse(dec.func)}")
+             return decorators
+
+         def get_base_classes(node: ast.ClassDef) -> list[str]:
+             """Extract base class names from a class definition."""
+             bases = []
+             for base in node.bases:
+                 if isinstance(base, ast.Name):
+                     bases.append(base.id)
+                 elif isinstance(base, ast.Attribute):
+                     bases.append(ast.unparse(base))
+                 else:
+                     bases.append(ast.unparse(base))
+             return bases
+
+         def get_return_type(node: ast.FunctionDef | ast.AsyncFunctionDef) -> str:
+             """Extract return type annotation from a function."""
+             if node.returns:
+                 return ast.unparse(node.returns)
+             return ""
+
+         def get_parameters(node: ast.FunctionDef | ast.AsyncFunctionDef) -> list[str]:
+             """Extract parameter signatures from a function."""
+             params = []
+             for arg in node.args.args:
+                 param = arg.arg
+                 if arg.annotation:
+                     param += f": {ast.unparse(arg.annotation)}"
+                 params.append(param)
+             return params
+
+         def add_chunk(
+             node: ast.FunctionDef | ast.AsyncFunctionDef | ast.ClassDef,
+             node_type: str,
+             name: str,
+             parent_class: str | None = None,
+         ) -> None:
+             """Add a chunk for a function/class node."""
+             start_line = node.lineno
+             end_line = node.end_lineno or start_line
+
+             # Extract the source code for this node
+             chunk_lines = lines[start_line - 1 : end_line]
+             chunk_text = "\n".join(chunk_lines)
+             content_hash = hashlib.md5(chunk_text.encode("utf-8")).hexdigest()[:12]
+
+             # Skip very small chunks
+             if len(chunk_lines) < 3:
+                 return
+
+             # Build descriptive header
+             docstring = get_docstring(node)
+             if parent_class:
+                 header = f"File: {rel_path}\n{node_type}: {parent_class}.{name}\nLines: {start_line}-{end_line}"
+             else:
+                 header = f"File: {rel_path}\n{node_type}: {name}\nLines: {start_line}-{end_line}"
+
+             if docstring:
+                 header += f"\nDocstring: {docstring[:200]}..."
+
+             document = f"{header}\n\n{chunk_text}"
+
+             chunks.append(
+                 {
+                     "id": f"{rel_path}:{start_line}-{end_line}:{content_hash}",
+                     "document": document,
+                     "metadata": {
+                         "file_path": rel_path,
+                         "start_line": start_line,
+                         "end_line": end_line,
+                         "language": language,
+                         "node_type": node_type.lower(),
+                         "name": f"{parent_class}.{name}" if parent_class else name,
+                         # Structural metadata for filtering
+                         "decorators": ",".join(get_decorators(node)),
+                         "is_async": isinstance(node, ast.AsyncFunctionDef),
+                         # Class-specific metadata
+                         "base_classes": ",".join(get_base_classes(node))
+                         if isinstance(node, ast.ClassDef)
+                         else "",
+                         # Function-specific metadata
+                         "return_type": get_return_type(node)
+                         if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
+                         else "",
+                         "parameters": ",".join(get_parameters(node))
+                         if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
+                         else "",
+                     },
+                 }
+             )
+
+         # Walk the AST and extract functions/classes
+         for node in ast.walk(tree):
+             if isinstance(node, ast.ClassDef):
+                 add_chunk(node, "Class", node.name)
+                 # Also add methods as separate chunks for granular search
+                 for item in node.body:
+                     if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                         add_chunk(item, "Method", item.name, parent_class=node.name)
+             elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                 # Only top-level functions (not methods)
+                 # Check if this function is inside a class body
+                 is_method = False
+                 for parent in ast.walk(tree):
+                     if isinstance(parent, ast.ClassDef):
+                         body = getattr(parent, "body", None)
+                         if isinstance(body, list) and node in body:
+                             is_method = True
+                             break
+                 if not is_method:
+                     add_chunk(node, "Function", node.name)
+
+         # If we found no functions/classes, chunk module-level code
+         if not chunks and len(lines) >= 5:
+             # Add module-level chunk for imports and constants
+             module_chunk = "\n".join(lines[: min(50, len(lines))])
+             chunks.append(
+                 {
+                     "id": f"{rel_path}:1-{min(50, len(lines))}",
+                     "document": f"File: {rel_path}\nModule-level code\nLines: 1-{min(50, len(lines))}\n\n{module_chunk}",
+                     "metadata": {
+                         "file_path": rel_path,
+                         "start_line": 1,
+                         "end_line": min(50, len(lines)),
+                         "language": language,
+                         "node_type": "module",
+                         "name": rel_path,
+                     },
+                 }
+             )
+
+         return chunks
+
+     def _chunk_by_lines(self, lines: list[str], rel_path: str, language: str) -> list[dict]:
+         """Fallback line-based chunking with overlap."""
+         chunks = []
+
+         for i in range(0, len(lines), self.CHUNK_SIZE - self.CHUNK_OVERLAP):
+             chunk_lines = lines[i : i + self.CHUNK_SIZE]
+             if len(chunk_lines) < 5:  # Skip tiny trailing chunks
+                 continue
+
+             chunk_text = "\n".join(chunk_lines)
+             content_hash = hashlib.md5(chunk_text.encode("utf-8")).hexdigest()[:12]
+             start_line = i + 1
+             end_line = i + len(chunk_lines)
+
+             # Create a searchable document with context
+             document = f"File: {rel_path}\nLines: {start_line}-{end_line}\n\n{chunk_text}"
+
+             chunks.append(
+                 {
+                     "id": f"{rel_path}:{start_line}-{end_line}:{content_hash}",
+                     "document": document,
+                     "metadata": {
+                         "file_path": rel_path,
+                         "start_line": start_line,
+                         "end_line": end_line,
+                         "language": language,
+                     },
+                 }
+             )
+
+         return chunks
+
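
To make the overlap arithmetic concrete: with CHUNK_SIZE = 50 and CHUNK_OVERLAP = 10, chunk starts advance by 40 lines, so a 120-line file yields chunks covering lines 1-50, 41-90, and 81-120; a trailing window shorter than 5 lines would be dropped by the size check above.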
+     def _get_files_to_index(self) -> list[Path]:
+         """Get all indexable files in the project."""
+         files = []
+         for file_path in self.project_path.rglob("*"):
+             if file_path.is_file():
+                 # Skip files outside project boundaries (symlink traversal protection)
+                 try:
+                     resolved_file = file_path.resolve()
+                     resolved_project = self.project_path.resolve()
+
+                     # Check if file is under project using parent chain with samefile()
+                     # This handles macOS /var → /private/var aliasing and symlinks
+                     found = False
+                     current = resolved_file.parent
+                     while current != current.parent:  # Stop at filesystem root
+                         try:
+                             if current.samefile(resolved_project):
+                                 found = True
+                                 break
+                         except OSError:
+                             # samefile can fail on some filesystems; try string comparison
+                             if current == resolved_project:
+                                 found = True
+                                 break
+                         current = current.parent
+
+                     if not found:
+                         continue  # Outside project
+                 except (ValueError, OSError):
+                     continue  # Outside project boundaries
+
+                 # Skip hidden files and directories
+                 if any(
+                     part.startswith(".") for part in file_path.parts[len(self.project_path.parts) :]
+                 ) and file_path.suffix not in {".md", ".txt"}:  # Allow .github docs
+                     continue
+
+                 # Skip excluded directories
+                 if any(skip_dir in file_path.parts for skip_dir in self.SKIP_DIRS):
+                     continue
+
+                 # Only include code files
+                 if file_path.suffix.lower() in self.CODE_EXTENSIONS:
+                     files.append(file_path)
+
+         return files
+
+     def request_cancel_indexing(self) -> None:
+         """Request cancellation of ongoing indexing operation.
+
+         Sets a flag that will be checked between batches. The operation will
+         stop gracefully after completing the current batch.
+         """
+         with self._cancel_lock:
+             self._cancel_indexing = True
+             logger.info(f"Cancellation requested for {self.project_path}")
+
+     def clear_cancel_flag(self) -> None:
+         """Clear the cancellation flag."""
+         with self._cancel_lock:
+             self._cancel_indexing = False
+
+     def is_cancellation_requested(self) -> bool:
+         """Check if cancellation has been requested."""
+         with self._cancel_lock:
+             return self._cancel_indexing
+
+     async def index_codebase(self, force: bool = False) -> dict:
+         """
+         Index the entire codebase into the vector store.
+
+         This operation can be cancelled by calling request_cancel_indexing().
+         Cancellation happens between batches, so the current batch will complete.
+
+         Args:
+             force: If True, reindex everything. Otherwise, only index new/changed files.
+
+         Returns:
+             Statistics about the indexing operation.
+         """
+         import time
+
+         # Clear any previous cancellation requests
+         self.clear_cancel_flag()
+
+         # Start timing
+         start_time = time.time()
+
+         print(f"🔍 SEMANTIC-INDEX: {self.project_path}", file=sys.stderr)
+
+         # Notify reindex start (non-blocking)
+         notifier = None  # Initialize to avoid NameError in error handlers
+         try:
+             from mcp_bridge.notifications import get_notification_manager
+
+             notifier = get_notification_manager()
+             await notifier.notify_reindex_start(str(self.project_path))
+         except Exception as e:
+             logger.warning(f"Failed to send reindex start notification: {e}")
+
+         try:
+             if not await self.check_embedding_service():
+                 error_msg = "Embedding service not available"
+                 # Notify error
+                 try:
+                     if notifier:
+                         await notifier.notify_reindex_error(error_msg)
+                 except Exception as e:
+                     logger.warning(f"Failed to send reindex error notification: {e}")
+                 return {"error": error_msg, "indexed": 0}
+
+             # Get existing document IDs
+             existing_ids = set()
+             try:
+                 # Only fetch IDs to minimize overhead
+                 existing = self.collection.get(include=[])
+                 existing_ids = set(existing["ids"]) if existing["ids"] else set()
+             except Exception:
+                 pass
+
+             if force:
+                 # Clear existing collection
+                 try:
+                     self.client.delete_collection("codebase")
+                     self._collection = None
+                     existing_ids = set()
+                 except Exception:
+                     pass
+
+             files = self._get_files_to_index()
+             all_chunks = []
+             current_chunk_ids = set()
+
+             # Mark: Generate all chunks for current codebase
+             for file_path in files:
+                 chunks = self._chunk_file(file_path)
+                 all_chunks.extend(chunks)
+                 for c in chunks:
+                     current_chunk_ids.add(c["id"])
+
+             # Sweep: Identify stale chunks to remove
+             to_delete = existing_ids - current_chunk_ids
+
+             # Identify new chunks to add
+             to_add_ids = current_chunk_ids - existing_ids
+             chunks_to_add = [c for c in all_chunks if c["id"] in to_add_ids]
+
+             # Prune stale chunks
+             if to_delete:
+                 print(f" Pruning {len(to_delete)} stale chunks...", file=sys.stderr)
+                 self.collection.delete(ids=list(to_delete))
+
+             if not chunks_to_add:
+                 stats = {
+                     "indexed": 0,
+                     "pruned": len(to_delete),
+                     "total_files": len(files),
+                     "message": "No new chunks to index",
+                     "time_taken": round(time.time() - start_time, 1),
+                 }
+                 # Notify completion
+                 try:
+                     if notifier:
+                         await notifier.notify_reindex_complete(stats)
+                 except Exception as e:
+                     logger.warning(f"Failed to send reindex complete notification: {e}")
+                 return stats
+
+             # Batch embed and store
+             batch_size = 50
+             total_indexed = 0
+
+             for i in range(0, len(chunks_to_add), batch_size):
+                 # Check for cancellation between batches
+                 if self.is_cancellation_requested():
+                     print(f" ⚠️ Indexing cancelled after {total_indexed} chunks", file=sys.stderr)
+                     stats = {
+                         "indexed": total_indexed,
+                         "pruned": len(to_delete),
+                         "total_files": len(files),
+                         "db_path": str(self.db_path),
+                         "time_taken": round(time.time() - start_time, 1),
+                         "cancelled": True,
+                         "message": f"Cancelled after {total_indexed}/{len(chunks_to_add)} chunks",
+                     }
+                     # Notify cancellation
+                     try:
+                         if notifier:
+                             await notifier.notify_reindex_error(
+                                 f"Indexing cancelled by user after {total_indexed} chunks"
+                             )
+                     except Exception as e:
+                         logger.warning(f"Failed to send cancellation notification: {e}")
+                     return stats
+
+                 batch = chunks_to_add[i : i + batch_size]
+
+                 documents = [c["document"] for c in batch]
+                 embeddings = await self.get_embeddings_batch(documents)
+
+                 self.collection.add(
+                     ids=[c["id"] for c in batch],
+                     documents=documents,
+                     embeddings=embeddings,  # type: ignore[arg-type]
+                     metadatas=[c["metadata"] for c in batch],
+                 )
+                 total_indexed += len(batch)
+                 print(f" Indexed {total_indexed}/{len(chunks_to_add)} chunks...", file=sys.stderr)
+
+             stats = {
+                 "indexed": total_indexed,
+                 "pruned": len(to_delete),
+                 "total_files": len(files),
+                 "db_path": str(self.db_path),
+                 "time_taken": round(time.time() - start_time, 1),
+             }
+
+             # Notify completion
+             try:
+                 if notifier:
+                     await notifier.notify_reindex_complete(stats)
+             except Exception as e:
+                 logger.warning(f"Failed to send reindex complete notification: {e}")
+
+             return stats
+
+         except Exception as e:
+             error_msg = str(e)
+             logger.error(f"Reindexing failed: {error_msg}")
+
+             # Notify error
+             try:
+                 if notifier:
+                     await notifier.notify_reindex_error(error_msg)
+             except Exception as notify_error:
+                 logger.warning(f"Failed to send reindex error notification: {notify_error}")
+
+             raise
+
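
A sketch of the cancellation flow the docstring above describes, assuming an Ollama-backed store and using get_store from later in this file; the 5-second delay is illustrative:

    import asyncio

    async def reindex_briefly() -> None:
        store = get_store(".", provider="ollama")
        task = asyncio.create_task(store.index_codebase(force=True))
        await asyncio.sleep(5)           # let a few 50-chunk batches run
        store.request_cancel_indexing()  # flag is checked between batches
        stats = await task               # stats["cancelled"] is True if it stopped early
        print(stats)

    asyncio.run(reindex_briefly())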
+     async def search(
+         self,
+         query: str,
+         n_results: int = 10,
+         language: str | None = None,
+         node_type: str | None = None,
+         decorator: str | None = None,
+         is_async: bool | None = None,
+         base_class: str | None = None,
+     ) -> list[dict]:
+         """
+         Search the codebase with a natural language query.
+
+         Args:
+             query: Natural language search query
+             n_results: Maximum number of results to return
+             language: Filter by language (e.g., "py", "ts", "js")
+             node_type: Filter by node type (e.g., "function", "class", "method")
+             decorator: Filter by decorator (e.g., "@property", "@staticmethod")
+             is_async: Filter by async status (True = async only, False = sync only)
+             base_class: Filter by base class (e.g., "BaseClass")
+
+         Returns:
+             List of matching code chunks with metadata.
+         """
+         filters = []
+         if language:
+             filters.append(f"language={language}")
+         if node_type:
+             filters.append(f"node_type={node_type}")
+         if decorator:
+             filters.append(f"decorator={decorator}")
+         if is_async is not None:
+             filters.append(f"is_async={is_async}")
+         if base_class:
+             filters.append(f"base_class={base_class}")
+         filter_str = f" [{', '.join(filters)}]" if filters else ""
+         print(f"🔎 SEMANTIC-SEARCH: '{query[:50]}...'{filter_str}", file=sys.stderr)
+
+         if not await self.check_embedding_service():
+             return [{"error": "Embedding service not available"}]
+
+         # Check if collection has documents
+         try:
+             count = self.collection.count()
+             if count == 0:
+                 return [{"error": "No documents indexed", "hint": "Run index_codebase first"}]
+         except Exception as e:
+             return [{"error": f"Collection error: {e}"}]
+
+         # Get query embedding
+         query_embedding = await self.get_embedding(query)
+
+         # Build where clause for metadata filtering
+         where_filters = []
+         if language:
+             where_filters.append({"language": language})
+         if node_type:
+             where_filters.append({"node_type": node_type.lower()})
+         if decorator:
+             # ChromaDB $like for substring match in comma-separated field
+             # Use % wildcards for pattern matching
+             where_filters.append({"decorators": {"$like": f"%{decorator}%"}})
+         if is_async is not None:
+             where_filters.append({"is_async": is_async})
+         if base_class:
+             # Use $like for substring match
+             where_filters.append({"base_classes": {"$like": f"%{base_class}%"}})
+
+         where_clause = None
+         if len(where_filters) == 1:
+             where_clause = where_filters[0]
+         elif len(where_filters) > 1:
+             where_clause = {"$and": where_filters}
+
+         # Search with optional filtering
+         query_kwargs: dict = {
+             "query_embeddings": [query_embedding],
+             "n_results": n_results,
+             "include": ["documents", "metadatas", "distances"],
+         }
+         if where_clause:
+             query_kwargs["where"] = where_clause
+
+         results = self.collection.query(**query_kwargs)
+
+         # Format results
+         formatted = []
+         if results["ids"] and results["ids"][0]:
+             for i, _doc_id in enumerate(results["ids"][0]):
+                 metadata = results["metadatas"][0][i] if results["metadatas"] else {}
+                 distance = results["distances"][0][i] if results["distances"] else 0
+                 document = results["documents"][0][i] if results["documents"] else ""
+
+                 # Extract just the code part (skip file/line header)
+                 code_lines = document.split("\n\n", 1)
+                 code = code_lines[1] if len(code_lines) > 1 else document
+
+                 formatted.append(
+                     {
+                         "file": metadata.get("file_path", "unknown"),
+                         "lines": f"{metadata.get('start_line', '?')}-{metadata.get('end_line', '?')}",
+                         "language": metadata.get("language", ""),
+                         "relevance": round(1 - distance, 3),  # Convert distance to similarity
+                         "code_preview": code[:500] + "..." if len(code) > 500 else code,
+                     }
+                 )
+
+         return formatted
+
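
The filter parameters map directly onto the chunk metadata written by _chunk_python_ast. For example, restricting a query to async Python methods (a sketch to run inside an async context; the query text is illustrative):

    async def find_async_methods(store: CodebaseVectorStore) -> None:
        hits = await store.search(
            "http request handling",
            n_results=5,
            language="py",
            node_type="method",
            is_async=True,
        )
        for hit in hits:
            print(hit["file"], hit["lines"], hit["relevance"])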
1659
+ def get_stats(self) -> dict:
1660
+ """Get statistics about the vector store."""
1661
+ try:
1662
+ count = self.collection.count()
1663
+ return {
1664
+ "project_path": str(self.project_path),
1665
+ "db_path": str(self.db_path),
1666
+ "chunks_indexed": count,
1667
+ "embedding_provider": self.provider.name,
1668
+ "embedding_dimension": self.provider.dimension,
1669
+ }
1670
+ except Exception as e:
1671
+ return {"error": str(e)}
1672
+
1673
+ def start_watching(self, debounce_seconds: float = 2.0) -> "CodebaseFileWatcher":
1674
+ """Start watching the project directory for file changes.
1675
+
1676
+ Args:
1677
+ debounce_seconds: Time to wait before reindexing after changes (default: 2.0s)
1678
+
1679
+ Returns:
1680
+ The CodebaseFileWatcher instance
1681
+ """
1682
+ with self._watcher_lock:
1683
+ if self._watcher is None:
1684
+ # Avoid circular import by importing here
1685
+ self._watcher = CodebaseFileWatcher(
1686
+ project_path=self.project_path,
1687
+ store=self,
1688
+ debounce_seconds=debounce_seconds,
1689
+ )
1690
+ self._watcher.start()
1691
+ else:
1692
+ if not self._watcher.is_running():
1693
+ self._watcher.start()
1694
+ else:
1695
+ logger.warning(f"Watcher for {self.project_path} is already running")
1696
+ return self._watcher
1697
+
1698
+ def stop_watching(self) -> bool:
1699
+ """Stop watching the project directory.
1700
+
1701
+ Returns:
1702
+ True if watcher was stopped, False if no watcher was active
1703
+ """
1704
+ with self._watcher_lock:
1705
+ if self._watcher is not None:
1706
+ self._watcher.stop()
1707
+ self._watcher = None
1708
+ return True
1709
+ return False
1710
+
1711
+ def is_watching(self) -> bool:
1712
+ """Check if the project directory is being watched.
1713
+
1714
+ Returns:
1715
+ True if watcher is active and running, False otherwise
1716
+ """
1717
+ with self._watcher_lock:
1718
+ if self._watcher is not None:
1719
+ return self._watcher.is_running()
1720
+ return False
1721
+
1722
+
1723
+ # --- Module-level API for MCP tools ---
1724
+
1725
+ _stores: dict[str, CodebaseVectorStore] = {}
1726
+ _stores_lock = threading.Lock()
1727
+
1728
+ # Module-level watcher management
1729
+ _watchers: dict[str, "CodebaseFileWatcher"] = {}
1730
+ _watchers_lock = threading.Lock()
1731
+
1732
+
1733
+ def _cleanup_watchers():
1734
+ """Cleanup function to stop all watchers on exit.
1735
+
1736
+ Registered with atexit to ensure graceful shutdown when Python exits normally.
1737
+ Note: This won't be called if the process is killed (SIGKILL) or crashes.
1738
+ """
1739
+ with _watchers_lock:
1740
+ for path, watcher in list(_watchers.items()):
1741
+ try:
1742
+ logger.debug(f"Stopping watcher for {path} on exit")
1743
+ watcher.stop()
1744
+ except Exception as e:
1745
+ logger.warning(f"Error stopping watcher for {path}: {e}")
1746
+
1747
+
1748
+ # Register cleanup handler for graceful shutdown
1749
+ atexit.register(_cleanup_watchers)
1750
+
1751
+
1752
+ def get_store(project_path: str, provider: EmbeddingProvider = "ollama") -> CodebaseVectorStore:
1753
+ """Get or create a vector store for a project.
1754
+
1755
+ Note: Cache key includes provider to prevent cross-provider conflicts
1756
+ (different providers have different embedding dimensions).
1757
+ """
1758
+ path = str(Path(project_path).resolve())
1759
+ cache_key = f"{path}:{provider}"
1760
+ if cache_key not in _stores:
1761
+ with _stores_lock:
1762
+ # Double-check pattern to avoid race condition
1763
+ if cache_key not in _stores:
1764
+ _stores[cache_key] = CodebaseVectorStore(path, provider)
1765
+ return _stores[cache_key]
1766
+
1767
+
1768
+ async def semantic_search(
1769
+ query: str,
1770
+ project_path: str = ".",
1771
+ n_results: int = 10,
1772
+ language: str | None = None,
1773
+ node_type: str | None = None,
1774
+ decorator: str | None = None,
1775
+ is_async: bool | None = None,
1776
+ base_class: str | None = None,
1777
+ provider: EmbeddingProvider = "ollama",
1778
+ ) -> str:
1779
+ """
1780
+ Search codebase with natural language query.
1781
+
1782
+ Args:
1783
+ query: Natural language search query (e.g., "find authentication logic")
1784
+ project_path: Path to the project root
1785
+ n_results: Maximum number of results to return
1786
+ language: Filter by language (e.g., "py", "ts", "js")
1787
+ node_type: Filter by node type (e.g., "function", "class", "method")
1788
+ decorator: Filter by decorator (e.g., "@property", "@staticmethod")
1789
+ is_async: Filter by async status (True = async only, False = sync only)
1790
+ base_class: Filter by base class (e.g., "BaseClass")
1791
+ provider: Embedding provider (ollama, mxbai, gemini, openai, huggingface)
1792
+
1793
+ Returns:
1794
+ Formatted search results with file paths and code snippets.
1795
+ """
1796
+ store = get_store(project_path, provider)
1797
+ results = await store.search(
1798
+ query,
1799
+ n_results,
1800
+ language,
1801
+ node_type,
1802
+ decorator=decorator,
1803
+ is_async=is_async,
1804
+ base_class=base_class,
1805
+ )
1806
+
1807
+ if not results:
1808
+ return "No results found"
1809
+
1810
+ if "error" in results[0]:
1811
+ return f"Error: {results[0]['error']}\nHint: {results[0].get('hint', 'Check Ollama is running')}"
1812
+
1813
+ lines = [f"Found {len(results)} results for: '{query}'\n"]
1814
+ for i, r in enumerate(results, 1):
1815
+ lines.append(f"{i}. {r['file']}:{r['lines']} (relevance: {r['relevance']})")
1816
+ lines.append(f"```{r['language']}")
1817
+ lines.append(r["code_preview"])
1818
+ lines.append("```\n")
1819
+
1820
+ return "\n".join(lines)
1821
+
1822
+
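+ # Illustrative usage sketch (not part of the tool surface; assumes a prior
+ # index exists and the chosen embedding service is reachable):
+ async def _example_semantic_search() -> str:  # pragma: no cover
+     return await semantic_search(
+         "where are auth tokens refreshed?",
+         project_path=".",
+         language="py",
+         n_results=5,
+     )
+ 
+ 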
1823
+ async def hybrid_search(
1824
+ query: str,
1825
+ pattern: str | None = None,
1826
+ project_path: str = ".",
1827
+ n_results: int = 10,
1828
+ language: str | None = None,
1829
+ node_type: str | None = None,
1830
+ decorator: str | None = None,
1831
+ is_async: bool | None = None,
1832
+ base_class: str | None = None,
1833
+ provider: EmbeddingProvider = "ollama",
1834
+ ) -> str:
1835
+ """
1836
+ Hybrid search combining semantic similarity with structural AST matching.
1837
+
1838
+ Performs semantic search first, then optionally filters/boosts results
1839
+ that also match an ast-grep structural pattern.
1840
+
1841
+ Args:
1842
+ query: Natural language search query (e.g., "find authentication logic")
1843
+ pattern: Optional ast-grep pattern for structural matching (e.g., "def $FUNC($$$):")
1844
+ project_path: Path to the project root
1845
+ n_results: Maximum number of results to return
1846
+ language: Filter by language (e.g., "py", "ts", "js")
1847
+ node_type: Filter by node type (e.g., "function", "class", "method")
1848
+ decorator: Filter by decorator (e.g., "@property", "@staticmethod")
1849
+ is_async: Filter by async status (True = async only, False = sync only)
1850
+ base_class: Filter by base class (e.g., "BaseClass")
1851
+ provider: Embedding provider (ollama, mxbai, gemini, openai, huggingface)
1852
+
1853
+ Returns:
1854
+ Formatted search results with relevance scores and structural match indicators.
1855
+ """
1856
+ from mcp_bridge.tools.code_search import ast_grep_search
1857
+
1858
+ # Get semantic results (fetch more if we're going to filter)
1859
+ fetch_count = n_results * 2 if pattern else n_results
1860
+ semantic_result = await semantic_search(
1861
+ query=query,
1862
+ project_path=project_path,
1863
+ n_results=fetch_count,
1864
+ language=language,
1865
+ node_type=node_type,
1866
+ decorator=decorator,
1867
+ is_async=is_async,
1868
+ base_class=base_class,
1869
+ provider=provider,
1870
+ )
1871
+
1872
+ if not pattern:
1873
+ return semantic_result
1874
+
1875
+ if semantic_result.startswith("Error:") or semantic_result == "No results found":
1876
+ return semantic_result
1877
+
1878
+ # Get structural matches from ast-grep
1879
+ ast_result = await ast_grep_search(
1880
+ pattern=pattern,
1881
+ directory=project_path,
1882
+ language=language or "",
1883
+ )
1884
+
1885
+ # Extract file paths from ast-grep results
1886
+ ast_files: set[str] = set()
1887
+ if ast_result and not ast_result.startswith("Error:") and ast_result != "No matches found":
1888
+ for line in ast_result.split("\n"):
1889
+ if line.startswith("- "):
1890
+ # Format: "- file.py:123"
1891
+ file_part = line[2:].split(":")[0]
1892
+ ast_files.add(file_part)
1893
+
1894
+ if not ast_files:
1895
+ # No structural matches, return semantic results with note
1896
+ return f"{semantic_result}\n\n[Note: No structural matches for pattern '{pattern}']"
1897
+
1898
+ # Parse semantic results and boost/annotate files that appear in both
1899
+ lines = []
1900
+ result_lines = semantic_result.split("\n")
1901
+ header = result_lines[0] if result_lines else ""
1902
+ lines.append(header.replace("results for:", "hybrid results for:"))
1903
+ lines.append(f"[Structural pattern: {pattern}]\n")
1904
+
1905
+ i = 1
1906
+ boosted_count = 0
1907
+ while i < len(result_lines):
1908
+ line = result_lines[i]
1909
+ # Result header lines look like "1. file.py:10-20"; annotate those
1910
+ # whose file also matched the structural pattern, and pass every
1911
+ # other line through unchanged.
1912
+ if line and line[0].isdigit() and "." in line:
1913
+ parts = line.split()
1914
+ file_part = parts[1].split(":")[0] if len(parts) > 1 else ""
1915
+ if file_part in ast_files:
1916
+ lines.append(f"{line} 🎯 [structural match]")
1917
+ boosted_count += 1
1918
+ else:
1919
+ lines.append(line)
1920
+ else:
1921
+ lines.append(line)
1922
+ i += 1
1923
+
1924
+ lines.append(
1925
+ f"\n[{boosted_count}/{len(ast_files)} semantic results also match structural pattern]"
1926
+ )
1927
+
1928
+ return "\n".join(lines)
1929
+
1930
+
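+ # Illustrative sketch: pair a conceptual query with the generic function
+ # pattern from the docstring above. Query text and values are hypothetical.
+ async def _example_hybrid_search() -> str:  # pragma: no cover
+     return await hybrid_search(
+         query="retry logic around HTTP calls",
+         pattern="def $FUNC($$$):",
+         language="py",
+         n_results=5,
+     )
+ 
+ 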
1931
+ async def index_codebase(
1932
+ project_path: str = ".",
1933
+ force: bool = False,
1934
+ provider: EmbeddingProvider = "ollama",
1935
+ ) -> str:
1936
+ """
1937
+ Index a codebase for semantic search.
1938
+
1939
+ Args:
1940
+ project_path: Path to the project root
1941
+ force: If True, reindex everything. Otherwise, only new/changed files.
1942
+ provider: Embedding provider - ollama (local/free), mxbai (local/free),
1943
+ gemini (cloud/OAuth), openai (cloud/OAuth), huggingface (cloud/token)
1944
+
1945
+ Returns:
1946
+ Indexing statistics.
1947
+ """
1948
+ store = get_store(project_path, provider)
1949
+ stats = await store.index_codebase(force=force)
1950
+
1951
+ if "error" in stats:
1952
+ return f"Error: {stats['error']}"
1953
+
1954
+ if stats.get("cancelled"):
1955
+ return (
1956
+ f"⚠️ Indexing cancelled\n"
1957
+ f"Indexed {stats['indexed']} chunks from {stats['total_files']} files before cancellation\n"
1958
+ f"{stats.get('message', '')}"
1959
+ )
1960
+
1961
+ return (
1962
+ f"Indexed {stats['indexed']} chunks from {stats['total_files']} files\n"
1963
+ f"Database: {stats.get('db_path', 'unknown')}\n"
1964
+ f"{stats.get('message', '')}"
1965
+ )
1966
+
1967
+
1968
+ def cancel_indexing(
1969
+ project_path: str = ".",
1970
+ provider: EmbeddingProvider = "ollama",
1971
+ ) -> str:
1972
+ """
1973
+ Cancel an ongoing indexing operation.
1974
+
1975
+ The cancellation happens gracefully between batches - the current batch
1976
+ will complete before the operation stops.
1977
+
1978
+ Args:
1979
+ project_path: Path to the project root
1980
+ provider: Embedding provider (must match the one used for indexing)
1981
+
1982
+ Returns:
1983
+ Confirmation message.
1984
+ """
1985
+ try:
1986
+ store = get_store(project_path, provider)
1987
+ store.request_cancel_indexing()
1988
+ return f"✅ Cancellation requested for {project_path}\nIndexing will stop after current batch completes."
1989
+ except Exception as e:
1990
+ return f"❌ Error requesting cancellation: {e}"
1991
+
1992
+
1993
+ async def semantic_stats(
1994
+ project_path: str = ".",
1995
+ provider: EmbeddingProvider = "ollama",
1996
+ ) -> str:
1997
+ """
1998
+ Get statistics about the semantic search index.
1999
+
2000
+ Args:
2001
+ project_path: Path to the project root
2002
+ provider: Embedding provider - ollama (local/free), mxbai (local/free),
2003
+ gemini (cloud/OAuth), openai (cloud/OAuth), huggingface (cloud/token)
2004
+
2005
+ Returns:
2006
+ Index statistics.
2007
+ """
2008
+ store = get_store(project_path, provider)
2009
+ stats = store.get_stats()
2010
+
2011
+ if "error" in stats:
2012
+ return f"Error: {stats['error']}"
2013
+
2014
+ return (
2015
+ f"Project: {stats['project_path']}\n"
2016
+ f"Database: {stats['db_path']}\n"
2017
+ f"Chunks indexed: {stats['chunks_indexed']}\n"
2018
+ f"Embedding provider: {stats['embedding_provider']} ({stats['embedding_dimension']} dims)"
2019
+ )
2020
+
2021
+
2022
+ def delete_index(
2023
+ project_path: str = ".",
2024
+ provider: EmbeddingProvider | None = None,
2025
+ delete_all: bool = False,
2026
+ ) -> str:
2027
+ """
2028
+ Delete semantic search index for a project.
2029
+
2030
+ Args:
2031
+ project_path: Path to the project root
2032
+ provider: Embedding provider (if None and delete_all=False, deletes all providers for this project)
2033
+ delete_all: If True, delete ALL indexes for ALL projects (ignores project_path and provider)
2034
+
2035
+ Returns:
2036
+ Confirmation message with deleted paths.
2037
+ """
2038
+ import shutil
2039
+
2040
+ vectordb_base = Path.home() / ".stravinsky" / "vectordb"
2041
+
2042
+ if not vectordb_base.exists():
2043
+ return "✅ No semantic search indexes found (vectordb directory doesn't exist)"
2044
+
2045
+ if delete_all:
2046
+ # Delete entire vectordb directory
2047
+ try:
2048
+ shutil.rmtree(vectordb_base)
2049
+ return "✅ Deleted all semantic search indexes for all projects"
2050
+ except Exception as e:
2051
+ return f"❌ Error deleting all indexes: {e}"
2052
+
2053
+ # Generate project hash
2054
+ project_path_resolved = Path(project_path).resolve()
2055
+ project_hash = hashlib.md5(str(project_path_resolved).encode()).hexdigest()[:12]
2056
+
2057
+ deleted = []
2058
+ errors = []
2059
+
2060
+ if provider:
2061
+ # Delete specific provider index for this project
2062
+ index_path = vectordb_base / f"{project_hash}_{provider}"
2063
+ if index_path.exists():
2064
+ try:
2065
+ shutil.rmtree(index_path)
2066
+ deleted.append(str(index_path))
2067
+ except Exception as e:
2068
+ errors.append(f"{provider}: {e}")
2069
+ else:
2070
+ errors.append(f"{provider}: Index not found")
2071
+ else:
2072
+ # Delete all provider indexes for this project
2073
+ providers: list[EmbeddingProvider] = ["ollama", "mxbai", "gemini", "openai", "huggingface"]
2074
+ for prov in providers:
2075
+ index_path = vectordb_base / f"{project_hash}_{prov}"
2076
+ if index_path.exists():
2077
+ try:
2078
+ shutil.rmtree(index_path)
2079
+ deleted.append(str(index_path))
2080
+ except Exception as e:
2081
+ errors.append(f"{prov}: {e}")
2082
+
2083
+ if not deleted and not errors:
2084
+ return f"⚠️ No indexes found for project: {project_path_resolved}\nProject hash: {project_hash}"
2085
+
2086
+ result = []
2087
+ if deleted:
2088
+ result.append(f"✅ Deleted {len(deleted)} index(es):")
2089
+ for path in deleted:
2090
+ result.append(f" - {path}")
2091
+ if errors:
2092
+ result.append(f"\n❌ Errors ({len(errors)}):")
2093
+ for error in errors:
2094
+ result.append(f" - {error}")
2095
+
2096
+ return "\n".join(result)
2097
+
2098
+
2099
+ async def semantic_health(project_path: str = ".", provider: EmbeddingProvider = "ollama") -> str:
2100
+ """Check health of semantic search system."""
2101
+ store = get_store(project_path, provider)
2102
+
2103
+ status = []
2104
+
2105
+ # Check Provider
2106
+ try:
2107
+ is_avail = await store.check_embedding_service()
2108
+ status.append(
2109
+ f"Provider ({store.provider.name}): {'✅ Online' if is_avail else '❌ Offline'}"
2110
+ )
2111
+ except Exception as e:
2112
+ status.append(f"Provider ({store.provider.name}): ❌ Error - {e}")
2113
+
2114
+ # Check DB
2115
+ try:
2116
+ count = store.collection.count()
2117
+ status.append(f"Vector DB: ✅ Online ({count} documents)")
2118
+ except Exception as e:
2119
+ status.append(f"Vector DB: ❌ Error - {e}")
2120
+
2121
+ return "\n".join(status)
2122
+
2123
+
2124
+ # ========================
2125
+ # FILE WATCHER MANAGEMENT
2126
+ # ========================
2127
+
2128
+
2129
+ async def start_file_watcher(
2130
+ project_path: str,
2131
+ provider: EmbeddingProvider = "ollama",
2132
+ debounce_seconds: float = 2.0,
2133
+ ) -> "CodebaseFileWatcher":
2134
+ """Start watching a project directory for file changes.
2135
+
2136
+ If an index exists, automatically performs an incremental reindex to catch up
2137
+ on any changes that happened while the watcher was not running.
2138
+
2139
+ Args:
2140
+ project_path: Path to the project root
2141
+ provider: Embedding provider to use for reindexing
2142
+ debounce_seconds: Time to wait before reindexing after changes
2143
+
2144
+ Returns:
2145
+ The started CodebaseFileWatcher instance
2146
+ """
2147
+ path = str(Path(project_path).resolve())
2148
+ with _watchers_lock:
2149
+ if path not in _watchers:
2150
+ store = get_store(project_path, provider)
2151
+
2152
+ # Check if index exists - CRITICAL: Must have index before watching
2153
+ try:
2154
+ stats = store.get_stats()
2155
+ chunks_indexed = stats.get("chunks_indexed", 0)
2156
+ if chunks_indexed == 0:
2157
+ raise ValueError(
2158
+ f"No semantic index found for '{path}'. "
2159
+ f"Run semantic_index(project_path='{path}', provider='{provider}') "
2160
+ f"before starting the file watcher."
2161
+ )
2162
+
2163
+ # Index exists - catch up on any missed changes
2164
+ print(f"📋 Catching up on changes since last index...", file=sys.stderr)
2165
+ await store.index_codebase(force=False)
2166
+ print(f"✅ Index updated, starting file watcher", file=sys.stderr)
2167
+
2168
+ except ValueError:
2169
+ # Re-raise ValueError (our intentional error)
2170
+ raise
2171
+ except Exception as e:
2172
+ # Collection doesn't exist or other error
2173
+ raise ValueError(
2174
+ f"No semantic index found for '{path}'. "
2175
+ f"Run semantic_index(project_path='{path}', provider='{provider}') "
2176
+ f"before starting the file watcher."
2177
+ ) from e
2178
+
2179
+ watcher = store.start_watching(debounce_seconds=debounce_seconds)
2180
+ _watchers[path] = watcher
2181
+ else:
2182
+ watcher = _watchers[path]
2183
+ if not watcher.is_running():
2184
+ watcher.start()
2185
+ return _watchers[path]
2186
+
2187
+
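+ # Illustrative lifecycle sketch (hypothetical project path): note that
+ # start_file_watcher raises ValueError when the project was never indexed.
+ async def _example_watcher_lifecycle() -> None:  # pragma: no cover
+     watcher = await start_file_watcher(".", provider="ollama", debounce_seconds=2.0)
+     print(list_file_watchers())  # [{"project_path": ..., "status": "running", ...}]
+     if watcher.is_running():
+         stop_file_watcher(".")
+ 
+ 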
2188
+ def stop_file_watcher(project_path: str) -> bool:
2189
+ """Stop watching a project directory.
2190
+
2191
+ Args:
2192
+ project_path: Path to the project root
2193
+
2194
+ Returns:
2195
+ True if watcher was stopped, False if no watcher was active
2196
+ """
2197
+ path = str(Path(project_path).resolve())
2198
+ with _watchers_lock:
2199
+ if path in _watchers:
2200
+ watcher = _watchers[path]
2201
+ watcher.stop()
2202
+ del _watchers[path]
2203
+ return True
2204
+ return False
2205
+
2206
+
2207
+ def get_file_watcher(project_path: str) -> "CodebaseFileWatcher | None":
2208
+ """Get an active file watcher for a project.
2209
+
2210
+ Args:
2211
+ project_path: Path to the project root
2212
+
2213
+ Returns:
2214
+ The CodebaseFileWatcher if active, None otherwise
2215
+ """
2216
+ path = str(Path(project_path).resolve())
2217
+ with _watchers_lock:
2218
+ watcher = _watchers.get(path)
2219
+ if watcher is not None and watcher.is_running():
2220
+ return watcher
2221
+ return None
2222
+
2223
+
2224
+ def list_file_watchers() -> list[dict]:
2225
+ """List all active file watchers.
2226
+
2227
+ Returns:
2228
+ List of dicts with watcher info (project_path, debounce_seconds, provider, status)
2229
+ """
2230
+ with _watchers_lock:
2231
+ watchers_info = []
2232
+ for path, watcher in _watchers.items():
2233
+ watchers_info.append(
2234
+ {
2235
+ "project_path": path,
2236
+ "debounce_seconds": watcher.debounce_seconds,
2237
+ "provider": watcher.store.provider_name,
2238
+ "status": "running" if watcher.is_running() else "stopped",
2239
+ }
2240
+ )
2241
+ return watchers_info
2242
+
2243
+
2244
+ # ========================
2245
+ # MULTI-QUERY EXPANSION & DECOMPOSITION
2246
+ # ========================
2247
+
2248
+
2249
+ async def _expand_query_with_llm(query: str, num_variations: int = 3) -> list[str]:
2250
+ """
2251
+ Use LLM to rephrase a query into multiple semantic variations.
2252
+
2253
+ For example: "database connection" -> ["SQLAlchemy engine setup",
2254
+ "connect to postgres", "db session management"]
2255
+
2256
+ Args:
2257
+ query: Original search query
2258
+ num_variations: Number of variations to generate (default: 3)
2259
+
2260
+ Returns:
2261
+ List of query variations including the original
2262
+ """
2263
+ from mcp_bridge.tools.model_invoke import invoke_gemini
2264
+
2265
+ prompt = f"""You are a code search query expander. Given a search query, generate {num_variations} alternative phrasings that would help find relevant code.
2266
+
2267
+ Original query: "{query}"
2268
+
2269
+ Generate {num_variations} alternative queries that:
2270
+ 1. Use different technical terminology (e.g., "database" -> "SQLAlchemy", "ORM", "connection pool")
2271
+ 2. Reference specific implementations or patterns
2272
+ 3. Include related concepts that might appear in code
2273
+
2274
+ Return ONLY the alternative queries, one per line. No numbering, no explanations.
2275
+ Example output for "database connection":
2276
+ SQLAlchemy engine configuration
2277
+ postgres connection setup
2278
+ db session factory pattern"""
2279
+
2280
+ try:
2281
+ result = await invoke_gemini(
2282
+ token_store=TokenStore(),
2283
+ prompt=prompt,
2284
+ model="gemini-2.0-flash",
2285
+ temperature=0.7,
2286
+ max_tokens=200,
2287
+ )
2288
+
2289
+ # Parse variations from response
2290
+ variations = [line.strip() for line in result.strip().split("\n") if line.strip()]
2291
+ # Always include original query first
2292
+ all_queries = [query] + variations[:num_variations]
2293
+ return all_queries
2294
+
2295
+ except Exception as e:
2296
+ logger.warning(f"Query expansion failed: {e}, using original query only")
2297
+ return [query]
2298
+
2299
+
2300
+ async def _decompose_query_with_llm(query: str) -> list[str]:
2301
+ """
2302
+ Break a complex query into smaller, focused sub-questions.
2303
+
2304
+ For example: "Initialize the DB and then create a user model" ->
2305
+ ["database initialization", "user model definition"]
2306
+
2307
+ Args:
2308
+ query: Complex search query
2309
+
2310
+ Returns:
2311
+ List of sub-queries, or [query] if decomposition not needed
2312
+ """
2313
+ from mcp_bridge.tools.model_invoke import invoke_gemini
2314
+
2315
+ prompt = f"""You are a code search query analyzer. Determine if this query should be broken into sub-queries.
2316
+
2317
+ Query: "{query}"
2318
+
2319
+ If the query contains multiple distinct concepts (connected by "and", "then", "also", etc.),
2320
+ break it into separate focused sub-queries.
2321
+
2322
+ If the query is already focused on a single concept, return just that query.
2323
+
2324
+ Return ONLY the sub-queries, one per line. No numbering, no explanations.
2325
+
2326
+ Examples:
2327
+ - "Initialize the DB and then create a user model" ->
2328
+ database initialization
2329
+ user model definition
2330
+
2331
+ - "authentication logic" ->
2332
+ authentication logic"""
2333
+
2334
+ try:
2335
+ result = await invoke_gemini(
2336
+ token_store=TokenStore(),
2337
+ prompt=prompt,
2338
+ model="gemini-2.0-flash",
2339
+ temperature=0.3, # Lower temperature for more consistent decomposition
2340
+ max_tokens=150,
2341
+ )
2342
+
2343
+ # Parse sub-queries from response
2344
+ sub_queries = [line.strip() for line in result.strip().split("\n") if line.strip()]
2345
+ return sub_queries if sub_queries else [query]
2346
+
2347
+ except Exception as e:
2348
+ logger.warning(f"Query decomposition failed: {e}, using original query")
2349
+ return [query]
2350
+
2351
+
2352
+ def _aggregate_results(
2353
+ all_results: list[list[dict]],
2354
+ n_results: int = 10,
2355
+ ) -> list[dict]:
2356
+ """
2357
+ Aggregate and deduplicate results from multiple queries.
2358
+
2359
+ Uses reciprocal rank fusion to combine relevance scores from different queries.
2360
+
2361
+ Args:
2362
+ all_results: List of result lists from different queries
2363
+ n_results: Maximum number of results to return
2364
+
2365
+ Returns:
2366
+ Deduplicated and re-ranked results
2367
+ """
2368
+ # Track seen files to avoid duplicates
2369
+ seen_files: dict[str, dict] = {} # file:lines -> result with best score
2370
+ file_scores: dict[str, float] = {} # file:lines -> aggregated score
2371
+
2372
+ # Reciprocal Rank Fusion constant
2373
+ k = 60
2374
+
2375
+ for _query_idx, results in enumerate(all_results):
2376
+ for rank, result in enumerate(results):
2377
+ file_key = f"{result.get('file', '')}:{result.get('lines', '')}"
2378
+
2379
+ # RRF score contribution
2380
+ rrf_score = 1 / (k + rank + 1)
2381
+
2382
+ if file_key not in seen_files:
2383
+ seen_files[file_key] = result.copy()
2384
+ file_scores[file_key] = rrf_score
2385
+ else:
2386
+ # Aggregate scores
2387
+ file_scores[file_key] += rrf_score
2388
+ # Keep higher original relevance if available
2389
+ if result.get("relevance", 0) > seen_files[file_key].get("relevance", 0):
2390
+ seen_files[file_key] = result.copy()
2391
+
2392
+ # Sort by aggregated score and return top N
2393
+ sorted_keys = sorted(file_scores.keys(), key=lambda fk: file_scores[fk], reverse=True)
2394
+
2395
+ aggregated = []
2396
+ max_score = max(file_scores.values()) if file_scores else 1
2397
+ for key in sorted_keys[:n_results]:
2398
+ result = seen_files[key]
2399
+ # Normalize relevance to the top aggregated RRF score
2400
+ result["relevance"] = round(file_scores[key] / max_score, 3)
2401
+ aggregated.append(result)
2402
+
2403
+ return aggregated
2404
+
2405
+
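+ # Worked sketch of the fusion above (hypothetical files): a chunk ranked
+ # 1st and 2nd across two queries scores 1/61 + 1/62 ≈ 0.033 and outranks
+ # a chunk ranked 1st in only one query (1/61 ≈ 0.016).
+ def _example_rrf_fusion() -> list[dict]:  # pragma: no cover
+     query_a = [{"file": "auth.py", "lines": "10-40", "relevance": 0.91}]
+     query_b = [
+         {"file": "db.py", "lines": "5-30", "relevance": 0.88},
+         {"file": "auth.py", "lines": "10-40", "relevance": 0.80},
+     ]
+     return _aggregate_results([query_a, query_b], n_results=2)
+ 
+ 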
2406
+ async def multi_query_search(
2407
+ query: str,
2408
+ project_path: str = ".",
2409
+ n_results: int = 10,
2410
+ num_expansions: int = 3,
2411
+ language: str | None = None,
2412
+ node_type: str | None = None,
2413
+ provider: EmbeddingProvider = "ollama",
2414
+ ) -> str:
2415
+ """
2416
+ Search with LLM-expanded query variations for better recall.
2417
+
2418
+ Rephrases the query into multiple semantic variations, searches for each,
2419
+ and aggregates results using reciprocal rank fusion.
2420
+
2421
+ Args:
2422
+ query: Natural language search query
2423
+ project_path: Path to the project root
2424
+ n_results: Maximum number of results to return
2425
+ num_expansions: Number of query variations to generate (default: 3)
2426
+ language: Filter by language (e.g., "py", "ts")
2427
+ node_type: Filter by node type (e.g., "function", "class")
2428
+ provider: Embedding provider
2429
+
2430
+ Returns:
2431
+ Formatted search results with relevance scores.
2432
+ """
2433
+ import asyncio
2434
+
2435
+ print(f"🔍 MULTI-QUERY: Expanding '{query[:50]}...'", file=sys.stderr)
2436
+
2437
+ # Get query expansions
2438
+ expanded_queries = await _expand_query_with_llm(query, num_expansions)
2439
+ print(f" Generated {len(expanded_queries)} query variations", file=sys.stderr)
2440
+
2441
+ # Get store once
2442
+ store = get_store(project_path, provider)
2443
+
2444
+ # Search with all queries in parallel
2445
+ async def search_single(q: str) -> list[dict]:
2446
+ return await store.search(
2447
+ q,
2448
+ n_results=n_results, # Get full results for each query
2449
+ language=language,
2450
+ node_type=node_type,
2451
+ )
2452
+
2453
+ all_results = await asyncio.gather(*[search_single(q) for q in expanded_queries])
2454
+
2455
+ # Filter out error results
2456
+ valid_results = [r for r in all_results if r and "error" not in r[0]]
2457
+
2458
+ if not valid_results:
2459
+ if all_results and all_results[0] and "error" in all_results[0][0]:
2460
+ return f"Error: {all_results[0][0]['error']}"
2461
+ return "No results found"
2462
+
2463
+ # Aggregate results
2464
+ aggregated = _aggregate_results(valid_results, n_results)
2465
+
2466
+ if not aggregated:
2467
+ return "No results found"
2468
+
2469
+ # Format output
2470
+ lines = [f"Found {len(aggregated)} results for multi-query expansion of: '{query}'"]
2471
+ lines.append(
2472
+ f"[Expanded to: {', '.join(q[:30] + '...' if len(q) > 30 else q for q in expanded_queries)}]\n"
2473
+ )
2474
+
2475
+ for i, r in enumerate(aggregated, 1):
2476
+ lines.append(f"{i}. {r['file']}:{r['lines']} (relevance: {r['relevance']})")
2477
+ lines.append(f"```{r.get('language', '')}")
2478
+ lines.append(r.get("code_preview", ""))
2479
+ lines.append("```\n")
2480
+
2481
+ return "\n".join(lines)
2482
+
2483
+
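+ # Illustrative call (query taken from the expander's docstring): expansion
+ # helps terse queries reach differently-worded chunks.
+ async def _example_multi_query() -> str:  # pragma: no cover
+     return await multi_query_search("database connection", num_expansions=3)
+ 
+ 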
2484
+ async def decomposed_search(
2485
+ query: str,
2486
+ project_path: str = ".",
2487
+ n_results: int = 10,
2488
+ language: str | None = None,
2489
+ node_type: str | None = None,
2490
+ provider: EmbeddingProvider = "ollama",
2491
+ ) -> str:
2492
+ """
2493
+ Search by decomposing complex queries into focused sub-questions.
2494
+
2495
+ Breaks multi-part queries like "Initialize the DB and create a user model"
2496
+ into separate searches, returning organized results for each part.
2497
+
2498
+ Args:
2499
+ query: Complex search query (may contain multiple concepts)
2500
+ project_path: Path to the project root
2501
+ n_results: Maximum results per sub-query
2502
+ language: Filter by language
2503
+ node_type: Filter by node type
2504
+ provider: Embedding provider
2505
+
2506
+ Returns:
2507
+ Formatted results organized by sub-question.
2508
+ """
2509
+ import asyncio
2510
+
2511
+ print(f"🔍 DECOMPOSED-SEARCH: Analyzing '{query[:50]}...'", file=sys.stderr)
2512
+
2513
+ # Decompose query
2514
+ sub_queries = await _decompose_query_with_llm(query)
2515
+ print(f" Decomposed into {len(sub_queries)} sub-queries", file=sys.stderr)
2516
+
2517
+ if len(sub_queries) == 1 and sub_queries[0] == query:
2518
+ # No decomposition needed, use regular search
2519
+ return await semantic_search(
2520
+ query=query,
2521
+ project_path=project_path,
2522
+ n_results=n_results,
2523
+ language=language,
2524
+ node_type=node_type,
2525
+ provider=provider,
2526
+ )
2527
+
2528
+ # Get store once
2529
+ store = get_store(project_path, provider)
2530
+
2531
+ # Search each sub-query in parallel
2532
+ async def search_sub(q: str) -> tuple[str, list[dict]]:
2533
+ results = await store.search(
2534
+ q,
2535
+ n_results=n_results // len(sub_queries) + 2, # Distribute results
2536
+ language=language,
2537
+ node_type=node_type,
2538
+ )
2539
+ return (q, results)
2540
+
2541
+ sub_results = await asyncio.gather(*[search_sub(q) for q in sub_queries])
2542
+
2543
+ # Format output with sections for each sub-query
2544
+ lines = [f"Decomposed search for: '{query}'"]
2545
+ lines.append(f"[Split into {len(sub_queries)} sub-queries]\n")
2546
+
2547
+ total_results = 0
2548
+ for sub_query, results in sub_results:
2549
+ lines.append(f"### {sub_query}")
2550
+
2551
+ if not results or "error" in results[0]:
2552
+ lines.append(" No results found\n")
2553
+ continue
2554
+
2555
+ for i, r in enumerate(results[:5], 1): # Limit per sub-query
2556
+ lines.append(f" {i}. {r['file']}:{r['lines']} (relevance: {r['relevance']})")
2557
+ # Shorter preview for decomposed results
2558
+ preview = r.get("code_preview", "")[:200]
2559
+ if len(r.get("code_preview", "")) > 200:
2560
+ preview += "..."
2561
+ lines.append(f" ```{r.get('language', '')}")
2562
+ lines.append(f" {preview}")
2563
+ lines.append(" ```")
2564
+ total_results += 1
2565
+ lines.append("")
2566
+
2567
+ lines.append(f"[Total: {total_results} results across {len(sub_queries)} sub-queries]")
2568
+
2569
+ return "\n".join(lines)
2570
+
2571
+
2572
+ async def enhanced_search(
2573
+ query: str,
2574
+ project_path: str = ".",
2575
+ n_results: int = 10,
2576
+ mode: str = "auto",
2577
+ language: str | None = None,
2578
+ node_type: str | None = None,
2579
+ provider: EmbeddingProvider = "ollama",
2580
+ ) -> str:
2581
+ """
2582
+ Unified enhanced search combining expansion and decomposition.
2583
+
2584
+ Automatically selects the best strategy based on query complexity:
2585
+ - Simple queries: Multi-query expansion for better recall
2586
+ - Complex queries: Decomposition + expansion for comprehensive coverage
2587
+
2588
+ Args:
2589
+ query: Search query (simple or complex)
2590
+ project_path: Path to the project root
2591
+ n_results: Maximum number of results
2592
+ mode: Search mode - "auto", "expand", "decompose", or "both"
2593
+ language: Filter by language
2594
+ node_type: Filter by node type
2595
+ provider: Embedding provider
2596
+
2597
+ Returns:
2598
+ Formatted search results.
2599
+ """
2600
+ # Use classifier for intelligent mode selection
2601
+ classification = classify_query(query)
2602
+ logger.debug(
2603
+ f"Query classified as {classification.category.value} "
2604
+ f"(confidence: {classification.confidence:.2f}, suggested: {classification.suggested_tool})"
2605
+ )
2606
+
2607
+ # Determine mode based on classification
2608
+ if mode == "auto":
2609
+ # HYBRID → decompose (complex multi-part queries)
2610
+ # SEMANTIC → expand (conceptual queries benefit from variations)
2611
+ # PATTERN/STRUCTURAL → expand (simple queries, quick path)
2612
+ mode = "decompose" if classification.category == QueryCategory.HYBRID else "expand"
2613
+
2614
+ if mode == "decompose":
2615
+ return await decomposed_search(
2616
+ query=query,
2617
+ project_path=project_path,
2618
+ n_results=n_results,
2619
+ language=language,
2620
+ node_type=node_type,
2621
+ provider=provider,
2622
+ )
2623
+ elif mode == "expand":
2624
+ return await multi_query_search(
2625
+ query=query,
2626
+ project_path=project_path,
2627
+ n_results=n_results,
2628
+ language=language,
2629
+ node_type=node_type,
2630
+ provider=provider,
2631
+ )
2632
+ elif mode == "both":
2633
+ # Decompose first, then expand each sub-query
2634
+ sub_queries = await _decompose_query_with_llm(query)
2635
+
2636
+ all_results: list[list[dict]] = []
2637
+ store = get_store(project_path, provider)
2638
+
2639
+ for sub_q in sub_queries:
2640
+ # Expand each sub-query
2641
+ expanded = await _expand_query_with_llm(sub_q, num_variations=2)
2642
+ for exp_q in expanded:
2643
+ results = await store.search(
2644
+ exp_q,
2645
+ n_results=5,
2646
+ language=language,
2647
+ node_type=node_type,
2648
+ )
2649
+ if results and "error" not in results[0]:
2650
+ all_results.append(results)
2651
+
2652
+ aggregated = _aggregate_results(all_results, n_results)
2653
+
2654
+ if not aggregated:
2655
+ return "No results found"
2656
+
2657
+ lines = [f"Enhanced search (decompose+expand) for: '{query}'"]
2658
+ lines.append(f"[{len(sub_queries)} sub-queries × expansions]\n")
2659
+
2660
+ for i, r in enumerate(aggregated, 1):
2661
+ lines.append(f"{i}. {r['file']}:{r['lines']} (relevance: {r['relevance']})")
2662
+ lines.append(f"```{r.get('language', '')}")
2663
+ lines.append(r.get("code_preview", ""))
2664
+ lines.append("```\n")
2665
+
2666
+ return "\n".join(lines)
2667
+
2668
+ else:
2669
+ return f"Unknown mode: {mode}. Use 'auto', 'expand', 'decompose', or 'both'"
2670
+
2671
+
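+ # Illustrative mode selection (queries are hypothetical): "auto" defers to
+ # the classifier; the other modes force a strategy.
+ async def _example_enhanced_modes() -> None:  # pragma: no cover
+     await enhanced_search("authentication logic", mode="expand")
+     await enhanced_search("initialize the DB and create a user model", mode="decompose")
+ 
+ 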
2672
+ # ========================
2673
+ # FILE WATCHER IMPLEMENTATION
2674
+ # ========================
2675
+
2676
+
2677
+ class CodebaseFileWatcher:
2678
+ """Watch a project directory for file changes and trigger reindexing.
2679
+
2680
+ Features:
2681
+ - Watches for file create, modify, delete, move events
2682
+ - Filters to .py files only
2683
+ - Skips hidden files and directories (., .git, __pycache__, venv, etc.)
2684
+ - Debounces rapid changes to batch them into a single reindex
2685
+ - Thread-safe with daemon threads for clean shutdown
2686
+ - Integrates with CodebaseVectorStore for incremental indexing
2687
+ """
2688
+
2689
+ # Default debounce time in seconds
2690
+ DEFAULT_DEBOUNCE_SECONDS = 2.0
2691
+
2692
+ def __init__(
2693
+ self,
2694
+ project_path: Path | str,
2695
+ store: CodebaseVectorStore,
2696
+ debounce_seconds: float = DEFAULT_DEBOUNCE_SECONDS,
2697
+ ):
2698
+ """Initialize the file watcher.
2699
+
2700
+ Args:
2701
+ project_path: Path to the project root to watch
2702
+ store: CodebaseVectorStore instance for reindexing
2703
+ debounce_seconds: Time to wait before reindexing after changes (default: 2.0s)
2704
+ """
2705
+ self.project_path = Path(project_path).resolve()
2706
+ self.store = store
2707
+ self.debounce_seconds = debounce_seconds
2708
+
2709
+ # Observer and handler for watchdog
2710
+ self._observer = None
2711
+ self._event_handler = None
2712
+
2713
+ # Thread safety
2714
+ self._lock = threading.Lock()
2715
+ self._running = False
2716
+
2717
+ # Debouncing
2718
+ self._pending_reindex_timer: threading.Timer | None = None
2719
+ self._pending_files: set[Path] = set()
2720
+ self._pending_lock = threading.Lock()
2721
+
2722
+ def start(self) -> None:
2723
+ """Start watching the project directory.
2724
+
2725
+ Creates and starts a watchdog observer in a daemon thread.
2726
+ """
2727
+ with self._lock:
2728
+ if self._running:
2729
+ logger.warning(f"Watcher for {self.project_path} is already running")
2730
+ return
2731
+
2732
+ try:
2733
+ watchdog = get_watchdog()
2734
+ Observer = watchdog["Observer"]
2735
+
2736
+ # Create event handler class and instantiate
2737
+ FileChangeHandler = _create_file_change_handler_class()
2738
+ self._event_handler = FileChangeHandler(
2739
+ project_path=self.project_path,
2740
+ watcher=self,
2741
+ )
2742
+
2743
+ # Create and start observer (daemon mode for clean shutdown)
2744
+ self._observer = Observer()
2745
+ self._observer.daemon = True
2746
+ self._observer.schedule(
2747
+ self._event_handler,
2748
+ str(self.project_path),
2749
+ recursive=True,
2750
+ )
2751
+ self._observer.start()
2752
+ self._running = True
2753
+ logger.info(f"File watcher started for {self.project_path}")
2754
+
2755
+ except Exception as e:
2756
+ logger.error(f"Failed to start file watcher: {e}")
2757
+ self._running = False
2758
+ raise
2759
+
2760
+ def stop(self) -> None:
2761
+ """Stop watching the project directory.
2762
+
2763
+ Cancels any pending reindex timers and stops the observer.
2764
+ """
2765
+ with self._lock:
2766
+ # Cancel pending reindex
2767
+ if self._pending_reindex_timer is not None:
2768
+ self._pending_reindex_timer.cancel()
2769
+ self._pending_reindex_timer = None
2770
+
2771
+ # Stop observer
2772
+ if self._observer is not None:
2773
+ self._observer.stop()
2774
+ self._observer.join(timeout=5) # Wait up to 5 seconds for shutdown
2775
+ self._observer = None
2776
+
2777
+ self._event_handler = None
2778
+ self._running = False
2779
+ logger.info(f"File watcher stopped for {self.project_path}")
2780
+
2781
+ def is_running(self) -> bool:
2782
+ """Check if the watcher is currently running.
2783
+
2784
+ Returns:
2785
+ True if watcher is active, False otherwise
2786
+ """
2787
+ with self._lock:
2788
+ return self._running and self._observer is not None and self._observer.is_alive()
2789
+
2790
+ def _on_file_changed(self, file_path: Path) -> None:
2791
+ """Called when a file changes (internal use by _FileChangeHandler).
2792
+
2793
+ Accumulates files and triggers debounced reindex.
2794
+
2795
+ Args:
2796
+ file_path: Path to the changed file
2797
+ """
2798
+ with self._pending_lock:
2799
+ self._pending_files.add(file_path)
2800
+
2801
+ # Cancel previous timer
2802
+ if self._pending_reindex_timer is not None:
2803
+ self._pending_reindex_timer.cancel()
2804
+
2805
+ # Start new timer
2806
+ self._pending_reindex_timer = self._create_debounce_timer()
2807
+ self._pending_reindex_timer.start()
2808
+
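+ # Debounce timeline (illustrative): five saves between t=0.0s and t=1.5s
+ # each cancel the previous timer; a single reindex fires at ~t=3.5s.
+ 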
2809
+ def _create_debounce_timer(self) -> threading.Timer:
2810
+ """Create a new debounce timer for reindexing.
2811
+
2812
+ Returns:
2813
+ A threading.Timer configured for debounce reindexing
2814
+ """
2815
+ return threading.Timer(
2816
+ self.debounce_seconds,
2817
+ self._trigger_reindex,
2818
+ )
2819
+
2820
+ def _trigger_reindex(self) -> None:
2821
+ """Trigger reindexing of accumulated changed files.
2822
+
2823
+ This is called after the debounce period expires. It performs an
2824
+ incremental reindex focusing on the changed files.
2825
+ """
2826
+ import asyncio
2827
+
2828
+ with self._pending_lock:
2829
+ if not self._pending_files:
2830
+ self._pending_reindex_timer = None
2831
+ return
2832
+
2833
+ files_to_index = list(self._pending_files)
2834
+ self._pending_files.clear()
2835
+ self._pending_reindex_timer = None
2836
+
2837
+ # Run async reindex in a new event loop
2838
+ try:
2839
+ loop = asyncio.new_event_loop()
2840
+ asyncio.set_event_loop(loop)
2841
+ try:
2842
+ loop.run_until_complete(self.store.index_codebase(force=False))
2843
+ logger.debug(f"Reindexed {len(files_to_index)} changed files")
2844
+ finally:
2845
+ loop.close()
2846
+ except Exception as e:
2847
+ logger.error(f"Error during file watcher reindex: {e}")
2848
+
2849
+
2850
+ def _create_file_change_handler_class():
2851
+ """Create FileChangeHandler class that inherits from FileSystemEventHandler.
2852
+
2853
+ This is a factory function that creates the handler class dynamically
2854
+ after watchdog is imported, allowing for lazy loading.
2855
+ """
2856
+ watchdog = get_watchdog()
2857
+ FileSystemEventHandler = watchdog["FileSystemEventHandler"]
2858
+
2859
+ class _FileChangeHandler(FileSystemEventHandler):
2860
+ """Watchdog event handler for file system changes.
2861
+
2862
+ Detects file create, modify, delete, and move events, filters them,
2863
+ and notifies the watcher of relevant changes.
2864
+ """
2865
+
2866
+ def __init__(self, project_path: Path, watcher: CodebaseFileWatcher):
2867
+ """Initialize the event handler.
2868
+
2869
+ Args:
2870
+ project_path: Root path of the project being watched
2871
+ watcher: CodebaseFileWatcher instance to notify
2872
+ """
2873
+ super().__init__()
2874
+ self.project_path = project_path
2875
+ self.watcher = watcher
2876
+
2877
+ def on_created(self, event) -> None:
2878
+ """Called when a file is created."""
2879
+ if not event.is_directory and self._should_index_file(event.src_path):
2880
+ logger.debug(f"File created: {event.src_path}")
2881
+ self.watcher._on_file_changed(Path(event.src_path))
2882
+
2883
+ def on_modified(self, event) -> None:
2884
+ """Called when a file is modified."""
2885
+ if not event.is_directory and self._should_index_file(event.src_path):
2886
+ logger.debug(f"File modified: {event.src_path}")
2887
+ self.watcher._on_file_changed(Path(event.src_path))
2888
+
2889
+ def on_deleted(self, event) -> None:
2890
+ """Called when a file is deleted."""
2891
+ if not event.is_directory and self._should_index_file(event.src_path):
2892
+ logger.debug(f"File deleted: {event.src_path}")
2893
+ self.watcher._on_file_changed(Path(event.src_path))
2894
+
2895
+ def on_moved(self, event) -> None:
2896
+ """Called when a file is moved."""
2897
+ if not event.is_directory:
2898
+ # Check destination path
2899
+ if self._should_index_file(event.dest_path):
2900
+ logger.debug(f"File moved: {event.src_path} -> {event.dest_path}")
2901
+ self.watcher._on_file_changed(Path(event.dest_path))
2902
+ # Also check source path (for deletion case)
2903
+ elif self._should_index_file(event.src_path):
2904
+ logger.debug(f"File moved out: {event.src_path}")
2905
+ self.watcher._on_file_changed(Path(event.src_path))
2906
+
2907
+ def _should_index_file(self, file_path: str) -> bool:
2908
+ """Check if a file should trigger reindexing.
2909
+
2910
+ Filters based on:
2911
+ - File extension (.py only)
2912
+ - Hidden files and directories (starting with .)
2913
+ - Skip directories (venv, __pycache__, .git, node_modules, etc.)
2914
+
2915
+ Args:
2916
+ file_path: Path to the file to check
2917
+
2918
+ Returns:
2919
+ True if file should trigger reindexing, False otherwise
2920
+ """
2921
+ path = Path(file_path)
2922
+
2923
+ # Only .py files
2924
+ if path.suffix != ".py":
2925
+ return False
2926
+
2927
+ # Skip hidden files
2928
+ if path.name.startswith("."):
2929
+ return False
2930
+
2931
+ # Check for skip directories in the path
2932
+ for part in path.parts:
2933
+ if part.startswith("."): # Hidden directories like .git, .venv
2934
+ return False
2935
+ if part in {"__pycache__", "venv", "env", "node_modules"}:
2936
+ return False
2937
+
2938
+ # File is within project (resolve both paths to handle symlinks)
2939
+ try:
2940
+ path.resolve().relative_to(self.project_path)
2941
+ return True
2942
+ except ValueError:
2943
+ # File is outside project
2944
+ return False
2945
+
2946
+ return _FileChangeHandler
2947
+
2948
+
2949
+ # ========================
2950
+ # CHROMADB LOCK CLEANUP
2951
+ # ========================
2952
+
2953
+
2954
+ def _is_process_alive(pid: int) -> bool:
2955
+ """Check if a process with given PID is currently running.
2956
+
2957
+ Cross-platform process existence check.
2958
+
2959
+ Args:
2960
+ pid: Process ID to check
2961
+
2962
+ Returns:
2963
+ True if process exists, False otherwise
2964
+ """
2965
+ import os
2966
+ import sys
2967
+
2968
+ if sys.platform == "win32":
2969
+ # Windows: Use tasklist command
2970
+ import subprocess
2971
+
2972
+ try:
2973
+ result = subprocess.run(
2974
+ ["tasklist", "/FI", f"PID eq {pid}"], capture_output=True, text=True, timeout=2
2975
+ )
2976
+ return str(pid) in result.stdout
2977
+ except Exception:
2978
+ return False
2979
+ else:
2980
+ # Unix/Linux/macOS: Use os.kill(pid, 0)
2981
+ try:
2982
+ os.kill(pid, 0)
2983
+ return True
2984
+ except OSError:
2985
+ return False
2986
+ except Exception:
2987
+ return False
2988
+
2989
+
2990
+ def cleanup_stale_chromadb_locks() -> int:
2991
+ """Remove stale ChromaDB lock files on MCP server startup.
2992
+
2993
+ Scans all vectordb directories and removes lock files that:
2994
+ 1. Are older than 60 seconds (short grace period for active operations)
2995
+ 2. Don't have an owning process running (if PID can be determined)
2996
+
2997
+ This prevents 'Connection closed' errors from dead process locks.
2998
+
2999
+ Returns:
3000
+ Number of stale locks removed
3001
+ """
3002
+ vectordb_base = Path.home() / ".stravinsky" / "vectordb"
3003
+ if not vectordb_base.exists():
3004
+ return 0 # No vectordb yet, nothing to cleanup
3005
+
3006
+ import time
3007
+
3008
+ removed_count = 0
3009
+
3010
+ for project_dir in vectordb_base.iterdir():
3011
+ if not project_dir.is_dir():
3012
+ continue
3013
+
3014
+ lock_path = project_dir / ".chromadb.lock"
3015
+ if not lock_path.exists():
3016
+ continue
3017
+
3018
+ # Check lock age
3019
+ try:
3020
+ lock_age = time.time() - lock_path.stat().st_mtime
3021
+ except Exception:
3022
+ continue
3023
+
3024
+ # Aggressive cleanup: remove locks older than 60 seconds
3025
+ # This catches recently crashed processes (old 300s was too conservative)
3026
+ is_stale = lock_age > 60
3027
+
3028
+ # TODO: If lock file contains PID, check if process is alive
3029
+ # filelock doesn't write PID by default, but we could enhance this
3030
+
3031
+ if is_stale:
3032
+ try:
3033
+ lock_path.unlink(missing_ok=True)
3034
+ removed_count += 1
3035
+ logger.info(f"Removed stale lock: {lock_path} (age: {lock_age:.0f}s)")
3036
+ except Exception as e:
3037
+ logger.warning(f"Could not remove stale lock {lock_path}: {e}")
3038
+
3039
+ if removed_count > 0:
3040
+ logger.info(f"Startup cleanup: removed {removed_count} stale ChromaDB lock(s)")
3041
+
3042
+ return removed_count
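+ 
+ 
+ # Sketch for the TODO in cleanup_stale_chromadb_locks (an assumption, not
+ # current behavior): stamp a sidecar file with the owning PID at acquire
+ # time so cleanup can consult _is_process_alive() instead of age alone.
+ def _write_lock_owner(lock_path: Path) -> None:  # pragma: no cover
+     import os
+ 
+     lock_path.with_suffix(".owner").write_text(str(os.getpid()))
+ 
+ 
+ def _lock_owner_alive(lock_path: Path) -> bool:  # pragma: no cover
+     owner_file = lock_path.with_suffix(".owner")
+     try:
+         return _is_process_alive(int(owner_file.read_text().strip()))
+     except (FileNotFoundError, ValueError):
+         # No or garbled PID recorded; callers fall back to the age-based check
+         return False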