stravinsky 0.2.67__py3-none-any.whl → 0.4.66__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of stravinsky might be problematic. Click here for more details.

Files changed (190) hide show
  1. mcp_bridge/__init__.py +1 -1
  2. mcp_bridge/auth/__init__.py +16 -6
  3. mcp_bridge/auth/cli.py +202 -11
  4. mcp_bridge/auth/oauth.py +1 -2
  5. mcp_bridge/auth/openai_oauth.py +4 -7
  6. mcp_bridge/auth/token_store.py +112 -11
  7. mcp_bridge/cli/__init__.py +1 -1
  8. mcp_bridge/cli/install_hooks.py +503 -107
  9. mcp_bridge/cli/session_report.py +0 -3
  10. mcp_bridge/config/MANIFEST_SCHEMA.md +305 -0
  11. mcp_bridge/config/README.md +276 -0
  12. mcp_bridge/config/__init__.py +2 -2
  13. mcp_bridge/config/hook_config.py +247 -0
  14. mcp_bridge/config/hooks_manifest.json +138 -0
  15. mcp_bridge/config/rate_limits.py +317 -0
  16. mcp_bridge/config/skills_manifest.json +128 -0
  17. mcp_bridge/hooks/HOOKS_SETTINGS.json +17 -4
  18. mcp_bridge/hooks/__init__.py +19 -4
  19. mcp_bridge/hooks/agent_reminder.py +4 -4
  20. mcp_bridge/hooks/auto_slash_command.py +5 -5
  21. mcp_bridge/hooks/budget_optimizer.py +2 -2
  22. mcp_bridge/hooks/claude_limits_hook.py +114 -0
  23. mcp_bridge/hooks/comment_checker.py +3 -4
  24. mcp_bridge/hooks/compaction.py +2 -2
  25. mcp_bridge/hooks/context.py +2 -1
  26. mcp_bridge/hooks/context_monitor.py +2 -2
  27. mcp_bridge/hooks/delegation_policy.py +85 -0
  28. mcp_bridge/hooks/directory_context.py +3 -3
  29. mcp_bridge/hooks/edit_recovery.py +3 -2
  30. mcp_bridge/hooks/edit_recovery_policy.py +49 -0
  31. mcp_bridge/hooks/empty_message_sanitizer.py +2 -2
  32. mcp_bridge/hooks/events.py +160 -0
  33. mcp_bridge/hooks/git_noninteractive.py +4 -4
  34. mcp_bridge/hooks/keyword_detector.py +8 -10
  35. mcp_bridge/hooks/manager.py +43 -22
  36. mcp_bridge/hooks/notification_hook.py +13 -6
  37. mcp_bridge/hooks/parallel_enforcement_policy.py +67 -0
  38. mcp_bridge/hooks/parallel_enforcer.py +5 -5
  39. mcp_bridge/hooks/parallel_execution.py +22 -10
  40. mcp_bridge/hooks/post_tool/parallel_validation.py +103 -0
  41. mcp_bridge/hooks/pre_compact.py +8 -9
  42. mcp_bridge/hooks/pre_tool/agent_spawn_validator.py +115 -0
  43. mcp_bridge/hooks/preemptive_compaction.py +2 -3
  44. mcp_bridge/hooks/routing_notifications.py +80 -0
  45. mcp_bridge/hooks/rules_injector.py +11 -19
  46. mcp_bridge/hooks/session_idle.py +4 -4
  47. mcp_bridge/hooks/session_notifier.py +4 -4
  48. mcp_bridge/hooks/session_recovery.py +4 -5
  49. mcp_bridge/hooks/stravinsky_mode.py +1 -1
  50. mcp_bridge/hooks/subagent_stop.py +1 -3
  51. mcp_bridge/hooks/task_validator.py +2 -2
  52. mcp_bridge/hooks/tmux_manager.py +7 -8
  53. mcp_bridge/hooks/todo_delegation.py +4 -1
  54. mcp_bridge/hooks/todo_enforcer.py +180 -10
  55. mcp_bridge/hooks/tool_messaging.py +113 -10
  56. mcp_bridge/hooks/truncation_policy.py +37 -0
  57. mcp_bridge/hooks/truncator.py +1 -2
  58. mcp_bridge/metrics/cost_tracker.py +115 -0
  59. mcp_bridge/native_search.py +93 -0
  60. mcp_bridge/native_watcher.py +118 -0
  61. mcp_bridge/notifications.py +150 -0
  62. mcp_bridge/orchestrator/enums.py +11 -0
  63. mcp_bridge/orchestrator/router.py +165 -0
  64. mcp_bridge/orchestrator/state.py +32 -0
  65. mcp_bridge/orchestrator/visualization.py +14 -0
  66. mcp_bridge/orchestrator/wisdom.py +34 -0
  67. mcp_bridge/prompts/__init__.py +1 -8
  68. mcp_bridge/prompts/dewey.py +1 -1
  69. mcp_bridge/prompts/planner.py +2 -4
  70. mcp_bridge/prompts/stravinsky.py +53 -31
  71. mcp_bridge/proxy/__init__.py +0 -0
  72. mcp_bridge/proxy/client.py +70 -0
  73. mcp_bridge/proxy/model_server.py +157 -0
  74. mcp_bridge/routing/__init__.py +43 -0
  75. mcp_bridge/routing/config.py +250 -0
  76. mcp_bridge/routing/model_tiers.py +135 -0
  77. mcp_bridge/routing/provider_state.py +261 -0
  78. mcp_bridge/routing/task_classifier.py +190 -0
  79. mcp_bridge/server.py +542 -59
  80. mcp_bridge/server_tools.py +738 -6
  81. mcp_bridge/tools/__init__.py +40 -25
  82. mcp_bridge/tools/agent_manager.py +616 -697
  83. mcp_bridge/tools/background_tasks.py +13 -17
  84. mcp_bridge/tools/code_search.py +70 -53
  85. mcp_bridge/tools/continuous_loop.py +0 -1
  86. mcp_bridge/tools/dashboard.py +19 -0
  87. mcp_bridge/tools/find_code.py +296 -0
  88. mcp_bridge/tools/init.py +1 -0
  89. mcp_bridge/tools/list_directory.py +42 -0
  90. mcp_bridge/tools/lsp/__init__.py +12 -5
  91. mcp_bridge/tools/lsp/manager.py +471 -0
  92. mcp_bridge/tools/lsp/tools.py +723 -207
  93. mcp_bridge/tools/model_invoke.py +1195 -273
  94. mcp_bridge/tools/mux_client.py +75 -0
  95. mcp_bridge/tools/project_context.py +1 -2
  96. mcp_bridge/tools/query_classifier.py +406 -0
  97. mcp_bridge/tools/read_file.py +84 -0
  98. mcp_bridge/tools/replace.py +45 -0
  99. mcp_bridge/tools/run_shell_command.py +38 -0
  100. mcp_bridge/tools/search_enhancements.py +347 -0
  101. mcp_bridge/tools/semantic_search.py +3627 -0
  102. mcp_bridge/tools/session_manager.py +0 -2
  103. mcp_bridge/tools/skill_loader.py +0 -1
  104. mcp_bridge/tools/task_runner.py +5 -7
  105. mcp_bridge/tools/templates.py +3 -3
  106. mcp_bridge/tools/tool_search.py +331 -0
  107. mcp_bridge/tools/write_file.py +29 -0
  108. mcp_bridge/update_manager.py +585 -0
  109. mcp_bridge/update_manager_pypi.py +297 -0
  110. mcp_bridge/utils/cache.py +82 -0
  111. mcp_bridge/utils/process.py +71 -0
  112. mcp_bridge/utils/session_state.py +51 -0
  113. mcp_bridge/utils/truncation.py +76 -0
  114. stravinsky-0.4.66.dist-info/METADATA +517 -0
  115. stravinsky-0.4.66.dist-info/RECORD +198 -0
  116. {stravinsky-0.2.67.dist-info → stravinsky-0.4.66.dist-info}/entry_points.txt +1 -0
  117. stravinsky_claude_assets/HOOKS_INTEGRATION.md +316 -0
  118. stravinsky_claude_assets/agents/HOOKS.md +437 -0
  119. stravinsky_claude_assets/agents/code-reviewer.md +210 -0
  120. stravinsky_claude_assets/agents/comment_checker.md +580 -0
  121. stravinsky_claude_assets/agents/debugger.md +254 -0
  122. stravinsky_claude_assets/agents/delphi.md +495 -0
  123. stravinsky_claude_assets/agents/dewey.md +248 -0
  124. stravinsky_claude_assets/agents/explore.md +1198 -0
  125. stravinsky_claude_assets/agents/frontend.md +472 -0
  126. stravinsky_claude_assets/agents/implementation-lead.md +164 -0
  127. stravinsky_claude_assets/agents/momus.md +464 -0
  128. stravinsky_claude_assets/agents/research-lead.md +141 -0
  129. stravinsky_claude_assets/agents/stravinsky.md +730 -0
  130. stravinsky_claude_assets/commands/delphi.md +9 -0
  131. stravinsky_claude_assets/commands/dewey.md +54 -0
  132. stravinsky_claude_assets/commands/git-master.md +112 -0
  133. stravinsky_claude_assets/commands/index.md +49 -0
  134. stravinsky_claude_assets/commands/publish.md +86 -0
  135. stravinsky_claude_assets/commands/review.md +73 -0
  136. stravinsky_claude_assets/commands/str/agent_cancel.md +70 -0
  137. stravinsky_claude_assets/commands/str/agent_list.md +56 -0
  138. stravinsky_claude_assets/commands/str/agent_output.md +92 -0
  139. stravinsky_claude_assets/commands/str/agent_progress.md +74 -0
  140. stravinsky_claude_assets/commands/str/agent_retry.md +94 -0
  141. stravinsky_claude_assets/commands/str/cancel.md +51 -0
  142. stravinsky_claude_assets/commands/str/clean.md +97 -0
  143. stravinsky_claude_assets/commands/str/continue.md +38 -0
  144. stravinsky_claude_assets/commands/str/index.md +199 -0
  145. stravinsky_claude_assets/commands/str/list_watchers.md +96 -0
  146. stravinsky_claude_assets/commands/str/search.md +205 -0
  147. stravinsky_claude_assets/commands/str/start_filewatch.md +136 -0
  148. stravinsky_claude_assets/commands/str/stats.md +71 -0
  149. stravinsky_claude_assets/commands/str/stop_filewatch.md +89 -0
  150. stravinsky_claude_assets/commands/str/unwatch.md +42 -0
  151. stravinsky_claude_assets/commands/str/watch.md +45 -0
  152. stravinsky_claude_assets/commands/strav.md +53 -0
  153. stravinsky_claude_assets/commands/stravinsky.md +292 -0
  154. stravinsky_claude_assets/commands/verify.md +60 -0
  155. stravinsky_claude_assets/commands/version.md +5 -0
  156. stravinsky_claude_assets/hooks/README.md +248 -0
  157. stravinsky_claude_assets/hooks/comment_checker.py +193 -0
  158. stravinsky_claude_assets/hooks/context.py +38 -0
  159. stravinsky_claude_assets/hooks/context_monitor.py +153 -0
  160. stravinsky_claude_assets/hooks/dependency_tracker.py +73 -0
  161. stravinsky_claude_assets/hooks/edit_recovery.py +46 -0
  162. stravinsky_claude_assets/hooks/execution_state_tracker.py +68 -0
  163. stravinsky_claude_assets/hooks/notification_hook.py +103 -0
  164. stravinsky_claude_assets/hooks/notification_hook_v2.py +96 -0
  165. stravinsky_claude_assets/hooks/parallel_execution.py +241 -0
  166. stravinsky_claude_assets/hooks/parallel_reinforcement.py +106 -0
  167. stravinsky_claude_assets/hooks/parallel_reinforcement_v2.py +112 -0
  168. stravinsky_claude_assets/hooks/pre_compact.py +123 -0
  169. stravinsky_claude_assets/hooks/ralph_loop.py +173 -0
  170. stravinsky_claude_assets/hooks/session_recovery.py +263 -0
  171. stravinsky_claude_assets/hooks/stop_hook.py +89 -0
  172. stravinsky_claude_assets/hooks/stravinsky_metrics.py +164 -0
  173. stravinsky_claude_assets/hooks/stravinsky_mode.py +146 -0
  174. stravinsky_claude_assets/hooks/subagent_stop.py +98 -0
  175. stravinsky_claude_assets/hooks/todo_continuation.py +111 -0
  176. stravinsky_claude_assets/hooks/todo_delegation.py +96 -0
  177. stravinsky_claude_assets/hooks/tool_messaging.py +281 -0
  178. stravinsky_claude_assets/hooks/truncator.py +23 -0
  179. stravinsky_claude_assets/rules/deployment_safety.md +51 -0
  180. stravinsky_claude_assets/rules/integration_wiring.md +89 -0
  181. stravinsky_claude_assets/rules/pypi_deployment.md +220 -0
  182. stravinsky_claude_assets/rules/stravinsky_orchestrator.md +32 -0
  183. stravinsky_claude_assets/settings.json +152 -0
  184. stravinsky_claude_assets/skills/chrome-devtools/SKILL.md +81 -0
  185. stravinsky_claude_assets/skills/sqlite/SKILL.md +77 -0
  186. stravinsky_claude_assets/skills/supabase/SKILL.md +74 -0
  187. stravinsky_claude_assets/task_dependencies.json +34 -0
  188. stravinsky-0.2.67.dist-info/METADATA +0 -284
  189. stravinsky-0.2.67.dist-info/RECORD +0 -76
  190. {stravinsky-0.2.67.dist-info → stravinsky-0.4.66.dist-info}/WHEEL +0 -0
@@ -0,0 +1,3627 @@
1
+ """
2
+ Semantic Code Search - Vector-based code understanding
3
+
4
+ Uses ChromaDB for persistent vector storage with multiple embedding providers:
5
+ - Ollama (local, free) - nomic-embed-text (768 dims)
6
+ - Mxbai (local, free) - mxbai-embed-large (1024 dims, better for code)
7
+ - Gemini (cloud, OAuth) - gemini-embedding-001 (768-3072 dims)
8
+ - OpenAI (cloud, OAuth) - text-embedding-3-small (1536 dims)
9
+ - HuggingFace (cloud, token) - sentence-transformers/all-mpnet-base-v2 (768 dims)
10
+
11
+ Enables natural language queries like "find authentication logic" without
12
+ requiring exact pattern matching.
13
+
14
+ Architecture:
15
+ - Per-project ChromaDB storage at ~/.stravinsky/vectordb/<project_hash>/
16
+ - Lazy initialization on first query
17
+ - Provider abstraction for embedding generation
18
+ - Chunking strategy: function/class level with context
19
+ """
20
+
21
+ import asyncio
22
+ import atexit
23
+ import hashlib
24
+ import logging
25
+ import signal
26
+ import sys
27
+ import threading
28
+ from abc import ABC, abstractmethod
29
+ from pathlib import Path
30
+ from typing import TYPE_CHECKING, Literal
31
+
32
+ if TYPE_CHECKING:
33
+ import pathspec
34
+
35
+ import httpx
36
+ from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
37
+
38
+ from mcp_bridge.auth.token_store import TokenStore
39
+ from mcp_bridge.tools.query_classifier import QueryCategory, classify_query
40
+ from mcp_bridge.native_search import native_chunk_code
41
+ from mcp_bridge.native_watcher import NativeFileWatcher
42
+
43
+ logger = logging.getLogger(__name__)
44
+
45
+
46
+ # Lazy imports for watchdog (avoid startup cost)
47
+ _watchdog = None
48
+ _watchdog_import_lock = threading.Lock()
49
+
50
+
51
+ def get_watchdog():
52
+ """Lazy import of watchdog components for file watching."""
53
+ global _watchdog
54
+ if _watchdog is None:
55
+ with _watchdog_import_lock:
56
+ if _watchdog is None:
57
+ from watchdog.events import FileSystemEventHandler
58
+ from watchdog.observers import Observer
59
+
60
+ _watchdog = {"Observer": Observer, "FileSystemEventHandler": FileSystemEventHandler}
61
+ return _watchdog
62
+
63
+
64
+ # Embedding provider type
65
+ EmbeddingProvider = Literal["ollama", "mxbai", "gemini", "openai", "huggingface"]
66
+
67
+ # Lazy imports to avoid startup cost
68
+ _chromadb = None
69
+ _ollama = None
70
+ _httpx = None
71
+ _filelock = None
72
+ _import_lock = threading.Lock()
73
+
74
+
75
+ def get_filelock():
76
+ global _filelock
77
+ if _filelock is None:
78
+ with _import_lock:
79
+ if _filelock is None:
80
+ import filelock
81
+
82
+ _filelock = filelock
83
+ return _filelock
84
+
85
+
86
+ def get_chromadb():
87
+ global _chromadb
88
+ if _chromadb is None:
89
+ with _import_lock:
90
+ if _chromadb is None:
91
+ try:
92
+ import chromadb
93
+
94
+ _chromadb = chromadb
95
+ except ImportError as e:
96
+ import sys
97
+
98
+ if sys.version_info >= (3, 14):
99
+ raise ImportError(
100
+ "ChromaDB is not available on Python 3.14+. "
101
+ "Semantic search is not supported on Python 3.14 yet. "
102
+ "Use Python 3.11-3.13 for semantic search features."
103
+ ) from e
104
+ raise
105
+ return _chromadb
106
+
107
+
108
+ def get_ollama():
109
+ global _ollama
110
+ if _ollama is None:
111
+ with _import_lock:
112
+ if _ollama is None:
113
+ import ollama
114
+
115
+ _ollama = ollama
116
+ return _ollama
117
+
118
+
119
+ def get_httpx():
120
+ global _httpx
121
+ if _httpx is None:
122
+ with _import_lock:
123
+ if _httpx is None:
124
+ import httpx
125
+
126
+ _httpx = httpx
127
+ return _httpx
128
+
129
+
130
+ # ========================
131
+ # GITIGNORE MANAGER
132
+ # ========================
133
+
134
+ # Lazy import for pathspec
135
+ _pathspec = None
136
+ _pathspec_lock = threading.Lock()
137
+
138
+
139
+ def get_pathspec():
140
+ """Lazy import of pathspec for gitignore pattern matching."""
141
+ global _pathspec
142
+ if _pathspec is None:
143
+ with _pathspec_lock:
144
+ if _pathspec is None:
145
+ import pathspec
146
+
147
+ _pathspec = pathspec
148
+ return _pathspec
149
+
150
+
151
+ class GitIgnoreManager:
152
+ """Manages .gitignore and .stravignore pattern matching.
153
+
154
+ Loads and caches gitignore-style patterns from:
155
+ - .gitignore (standard git ignore patterns)
156
+ - .stravignore (Stravinsky-specific ignore patterns)
157
+
158
+ Patterns are combined and cached per project for efficient matching.
159
+ The manager automatically reloads patterns if the ignore files are modified.
160
+ """
161
+
162
+ # Cache of GitIgnoreManager instances per project path
163
+ _instances: dict[str, "GitIgnoreManager"] = {}
164
+ _instances_lock = threading.Lock()
165
+
166
+ @classmethod
167
+ def get_instance(cls, project_path: Path) -> "GitIgnoreManager":
168
+ """Get or create a GitIgnoreManager for a project.
169
+
170
+ Args:
171
+ project_path: Root path of the project
172
+
173
+ Returns:
174
+ Cached GitIgnoreManager instance for the project
175
+ """
176
+ path_str = str(project_path.resolve())
177
+ if path_str not in cls._instances:
178
+ with cls._instances_lock:
179
+ if path_str not in cls._instances:
180
+ cls._instances[path_str] = cls(project_path)
181
+ return cls._instances[path_str]
182
+
183
+ @classmethod
184
+ def clear_cache(cls, project_path: Path | None = None) -> None:
185
+ """Clear cached GitIgnoreManager instances.
186
+
187
+ Args:
188
+ project_path: Clear only this project's cache, or all if None
189
+ """
190
+ with cls._instances_lock:
191
+ if project_path is None:
192
+ cls._instances.clear()
193
+ else:
194
+ path_str = str(project_path.resolve())
195
+ cls._instances.pop(path_str, None)
196
+
197
+ def __init__(self, project_path: Path):
198
+ """Initialize the GitIgnoreManager.
199
+
200
+ Args:
201
+ project_path: Root path of the project
202
+ """
203
+ self.project_path = project_path.resolve()
204
+ self._spec = None
205
+ self._gitignore_mtime: float | None = None
206
+ self._stravignore_mtime: float | None = None
207
+ self._lock = threading.Lock()
208
+
209
+ def _get_file_mtime(self, file_path: Path) -> float | None:
210
+ """Get modification time of a file, or None if it doesn't exist."""
211
+ try:
212
+ return file_path.stat().st_mtime
213
+ except (OSError, FileNotFoundError):
214
+ return None
215
+
216
+ def _needs_reload(self) -> bool:
217
+ """Check if ignore patterns need to be reloaded."""
218
+ gitignore_path = self.project_path / ".gitignore"
219
+ stravignore_path = self.project_path / ".stravignore"
220
+
221
+ current_gitignore_mtime = self._get_file_mtime(gitignore_path)
222
+ current_stravignore_mtime = self._get_file_mtime(stravignore_path)
223
+
224
+ # Check if either file has been modified or if we haven't loaded yet
225
+ if self._spec is None:
226
+ return True
227
+
228
+ if current_gitignore_mtime != self._gitignore_mtime:
229
+ return True
230
+
231
+ if current_stravignore_mtime != self._stravignore_mtime:
232
+ return True
233
+
234
+ return False
235
+
236
+ def _load_patterns(self) -> None:
237
+ """Load patterns from .gitignore and .stravignore files."""
238
+ pathspec = get_pathspec()
239
+
240
+ patterns = []
241
+ gitignore_path = self.project_path / ".gitignore"
242
+ stravignore_path = self.project_path / ".stravignore"
243
+
244
+ # Load .gitignore patterns
245
+ if gitignore_path.exists():
246
+ try:
247
+ with open(gitignore_path, encoding="utf-8") as f:
248
+ patterns.extend(f.read().splitlines())
249
+ self._gitignore_mtime = self._get_file_mtime(gitignore_path)
250
+ logger.debug(f"Loaded .gitignore from {gitignore_path}")
251
+ except Exception as e:
252
+ logger.warning(f"Failed to load .gitignore: {e}")
253
+ self._gitignore_mtime = None
254
+ else:
255
+ self._gitignore_mtime = None
256
+
257
+ # Load .stravignore patterns
258
+ if stravignore_path.exists():
259
+ try:
260
+ with open(stravignore_path, encoding="utf-8") as f:
261
+ patterns.extend(f.read().splitlines())
262
+ self._stravignore_mtime = self._get_file_mtime(stravignore_path)
263
+ logger.debug(f"Loaded .stravignore from {stravignore_path}")
264
+ except Exception as e:
265
+ logger.warning(f"Failed to load .stravignore: {e}")
266
+ self._stravignore_mtime = None
267
+ else:
268
+ self._stravignore_mtime = None
269
+
270
+ # Filter out empty lines and comments
271
+ patterns = [p for p in patterns if p.strip() and not p.strip().startswith("#")]
272
+
273
+ # Create pathspec matcher
274
+ self._spec = pathspec.PathSpec.from_lines("gitwildmatch", patterns)
275
+ logger.debug(f"Loaded {len(patterns)} ignore patterns for {self.project_path}")
276
+
277
+ @property
278
+ def spec(self):
279
+ """Get the PathSpec matcher, reloading if necessary."""
280
+ with self._lock:
281
+ if self._needs_reload():
282
+ self._load_patterns()
283
+ return self._spec
284
+
285
+ def is_ignored(self, file_path: Path) -> bool:
286
+ """Check if a file path should be ignored.
287
+
288
+ Args:
289
+ file_path: Absolute or relative path to check
290
+
291
+ Returns:
292
+ True if the file matches any ignore pattern, False otherwise
293
+ """
294
+ try:
295
+ # Convert to relative path from project root
296
+ if file_path.is_absolute():
297
+ rel_path = file_path.resolve().relative_to(self.project_path)
298
+ else:
299
+ rel_path = file_path
300
+
301
+ # pathspec expects forward slashes and string paths
302
+ rel_path_str = str(rel_path).replace("\\", "/")
303
+
304
+ # Check against patterns
305
+ spec = self.spec
306
+ if spec is None:
307
+ return False # No patterns loaded, nothing is ignored
308
+ return spec.match_file(rel_path_str)
309
+ except ValueError:
310
+ # Path is outside project - not ignored by gitignore (but may be ignored for other reasons)
311
+ return False
312
+ except Exception as e:
313
+ logger.warning(f"Error checking ignore status for {file_path}: {e}")
314
+ return False
315
+
316
+ def filter_paths(self, paths: list[Path]) -> list[Path]:
317
+ """Filter a list of paths, removing ignored ones.
318
+
319
+ Args:
320
+ paths: List of paths to filter
321
+
322
+ Returns:
323
+ List of paths that are not ignored
324
+ """
325
+ return [p for p in paths if not self.is_ignored(p)]
326
+
327
+
328
+ # ========================
329
+ # EMBEDDING PROVIDERS
330
+ # ========================
331
+
332
+
333
+ class BaseEmbeddingProvider(ABC):
334
+ """Abstract base class for embedding providers."""
335
+
336
+ @abstractmethod
337
+ async def get_embedding(self, text: str) -> list[float]:
338
+ """Get embedding vector for text."""
339
+ pass
340
+
341
+ @abstractmethod
342
+ async def check_available(self) -> bool:
343
+ """Check if the provider is available and ready."""
344
+ pass
345
+
346
+ @property
347
+ @abstractmethod
348
+ def dimension(self) -> int:
349
+ """Return the embedding dimension for this provider."""
350
+ pass
351
+
352
+ @property
353
+ @abstractmethod
354
+ def name(self) -> str:
355
+ """Return the provider name."""
356
+ pass
357
+
358
+
359
+ class OllamaProvider(BaseEmbeddingProvider):
360
+ """Ollama local embedding provider using nomic-embed-text."""
361
+
362
+ MODEL = "nomic-embed-text"
363
+ DIMENSION = 768
364
+
365
+ def __init__(self):
366
+ self._available: bool | None = None
367
+
368
+ @property
369
+ def dimension(self) -> int:
370
+ return self.DIMENSION
371
+
372
+ @property
373
+ def name(self) -> str:
374
+ return "ollama"
375
+
376
+ async def check_available(self) -> bool:
377
+ if self._available is not None:
378
+ return self._available
379
+
380
+ try:
381
+ ollama = get_ollama()
382
+ models = ollama.list()
383
+ model_names = [m.model for m in models.models] if hasattr(models, "models") else []
384
+
385
+ if not any(name and self.MODEL in name for name in model_names):
386
+ print(
387
+ f"⚠️ Embedding model '{self.MODEL}' not found. Run: ollama pull {self.MODEL}",
388
+ file=sys.stderr,
389
+ )
390
+ self._available = False
391
+ return False
392
+
393
+ self._available = True
394
+ return True
395
+ except Exception as e:
396
+ print(f"⚠️ Ollama not available: {e}. Start with: ollama serve", file=sys.stderr)
397
+ self._available = False
398
+ return False
399
+
400
+ async def get_embedding(self, text: str) -> list[float]:
401
+ ollama = get_ollama()
402
+ # nomic-embed-text has 8192 token context. Code can be 1-2 chars/token.
403
+ # Truncate to 2000 chars (~1000-2000 tokens) for larger safety margin
404
+ truncated = text[:2000] if len(text) > 2000 else text
405
+ response = ollama.embeddings(model=self.MODEL, prompt=truncated)
406
+ return response["embedding"]
407
+
408
+
409
+ class GeminiProvider(BaseEmbeddingProvider):
410
+ """Gemini embedding provider using OAuth authentication."""
411
+
412
+ MODEL = "gemini-embedding-001"
413
+ DIMENSION = 768 # Using 768 for efficiency, can be up to 3072
414
+
415
+ def __init__(self):
416
+ self._available: bool | None = None
417
+ self._token_store = None
418
+
419
+ def _get_token_store(self):
420
+ if self._token_store is None:
421
+ from ..auth.token_store import TokenStore
422
+
423
+ self._token_store = TokenStore()
424
+ return self._token_store
425
+
426
+ @property
427
+ def dimension(self) -> int:
428
+ return self.DIMENSION
429
+
430
+ @property
431
+ def name(self) -> str:
432
+ return "gemini"
433
+
434
+ async def check_available(self) -> bool:
435
+ if self._available is not None:
436
+ return self._available
437
+
438
+ try:
439
+ token_store = self._get_token_store()
440
+ access_token = token_store.get_access_token("gemini")
441
+
442
+ if not access_token:
443
+ print(
444
+ "⚠️ Gemini not authenticated. Run: stravinsky-auth login gemini",
445
+ file=sys.stderr,
446
+ )
447
+ self._available = False
448
+ return False
449
+
450
+ self._available = True
451
+ return True
452
+ except Exception as e:
453
+ print(f"⚠️ Gemini not available: {e}", file=sys.stderr)
454
+ self._available = False
455
+ return False
456
+
457
+ async def get_embedding(self, text: str) -> list[float]:
458
+ import os
459
+
460
+ from ..auth.oauth import (
461
+ ANTIGRAVITY_DEFAULT_PROJECT_ID,
462
+ ANTIGRAVITY_ENDPOINTS,
463
+ ANTIGRAVITY_HEADERS,
464
+ )
465
+
466
+ token_store = self._get_token_store()
467
+ access_token = token_store.get_access_token("gemini")
468
+
469
+ if not access_token:
470
+ raise ValueError("Not authenticated with Gemini. Run: stravinsky-auth login gemini")
471
+
472
+ httpx = get_httpx()
473
+
474
+ # Use Antigravity endpoint for embeddings (same auth as invoke_gemini)
475
+ project_id = os.getenv("STRAVINSKY_ANTIGRAVITY_PROJECT_ID", ANTIGRAVITY_DEFAULT_PROJECT_ID)
476
+
477
+ headers = {
478
+ "Authorization": f"Bearer {access_token}",
479
+ "Content-Type": "application/json",
480
+ **ANTIGRAVITY_HEADERS,
481
+ }
482
+
483
+ # Wrap request for Antigravity API
484
+ import uuid
485
+
486
+ inner_payload = {
487
+ "model": f"models/{self.MODEL}",
488
+ "content": {"parts": [{"text": text}]},
489
+ "outputDimensionality": self.DIMENSION,
490
+ }
491
+
492
+ wrapped_payload = {
493
+ "project": project_id,
494
+ "model": self.MODEL,
495
+ "userAgent": "antigravity",
496
+ "requestId": f"embed-{uuid.uuid4()}",
497
+ "request": inner_payload,
498
+ }
499
+
500
+ # Try endpoints in order
501
+ last_error = None
502
+ async with httpx.AsyncClient(timeout=60.0) as client:
503
+ for endpoint in ANTIGRAVITY_ENDPOINTS:
504
+ api_url = f"{endpoint}/v1internal:embedContent"
505
+
506
+ try:
507
+ response = await client.post(
508
+ api_url,
509
+ headers=headers,
510
+ json=wrapped_payload,
511
+ )
512
+
513
+ if response.status_code in (401, 403):
514
+ last_error = Exception(f"{response.status_code} from {endpoint}")
515
+ continue
516
+
517
+ response.raise_for_status()
518
+ data = response.json()
519
+
520
+ # Extract embedding from response
521
+ inner_response = data.get("response", data)
522
+ embedding = inner_response.get("embedding", {})
523
+ values = embedding.get("values", [])
524
+
525
+ if values:
526
+ return values
527
+
528
+ raise ValueError(f"No embedding values in response: {data}")
529
+
530
+ except Exception as e:
531
+ last_error = e
532
+ continue
533
+
534
+ raise ValueError(f"All Antigravity endpoints failed for embeddings: {last_error}")
535
+
536
+
537
+ class OpenAIProvider(BaseEmbeddingProvider):
538
+ """OpenAI embedding provider using OAuth authentication."""
539
+
540
+ MODEL = "text-embedding-3-small"
541
+ DIMENSION = 1536
542
+
543
+ def __init__(self):
544
+ self._available: bool | None = None
545
+ self._token_store = None
546
+
547
+ def _get_token_store(self):
548
+ if self._token_store is None:
549
+ from ..auth.token_store import TokenStore
550
+
551
+ self._token_store = TokenStore()
552
+ return self._token_store
553
+
554
+ @property
555
+ def dimension(self) -> int:
556
+ return self.DIMENSION
557
+
558
+ @property
559
+ def name(self) -> str:
560
+ return "openai"
561
+
562
+ async def check_available(self) -> bool:
563
+ if self._available is not None:
564
+ return self._available
565
+
566
+ try:
567
+ token_store = self._get_token_store()
568
+ access_token = token_store.get_access_token("openai")
569
+
570
+ if not access_token:
571
+ print(
572
+ "⚠️ OpenAI not authenticated. Run: stravinsky-auth login openai",
573
+ file=sys.stderr,
574
+ )
575
+ self._available = False
576
+ return False
577
+
578
+ self._available = True
579
+ return True
580
+ except Exception as e:
581
+ print(f"⚠️ OpenAI not available: {e}", file=sys.stderr)
582
+ self._available = False
583
+ return False
584
+
585
+ async def get_embedding(self, text: str) -> list[float]:
586
+ token_store = self._get_token_store()
587
+ access_token = token_store.get_access_token("openai")
588
+
589
+ if not access_token:
590
+ raise ValueError("Not authenticated with OpenAI. Run: stravinsky-auth login openai")
591
+
592
+ httpx = get_httpx()
593
+
594
+ # Use standard OpenAI API for embeddings
595
+ api_url = "https://api.openai.com/v1/embeddings"
596
+
597
+ headers = {
598
+ "Authorization": f"Bearer {access_token}",
599
+ "Content-Type": "application/json",
600
+ }
601
+
602
+ payload = {
603
+ "model": self.MODEL,
604
+ "input": text,
605
+ }
606
+
607
+ async with httpx.AsyncClient(timeout=60.0) as client:
608
+ response = await client.post(api_url, headers=headers, json=payload)
609
+
610
+ if response.status_code == 401:
611
+ raise ValueError("OpenAI authentication failed. Run: stravinsky-auth login openai")
612
+
613
+ response.raise_for_status()
614
+ data = response.json()
615
+
616
+ # Extract embedding from response
617
+ embeddings = data.get("data", [])
618
+ if embeddings and "embedding" in embeddings[0]:
619
+ return embeddings[0]["embedding"]
620
+
621
+ raise ValueError(f"No embedding in response: {data}")
622
+
623
+
624
+ class MxbaiProvider(BaseEmbeddingProvider):
625
+ """Ollama local embedding provider using mxbai-embed-large (better for code).
626
+
627
+ mxbai-embed-large is a 1024-dimensional model optimized for code understanding.
628
+ It generally outperforms nomic-embed-text on code-related retrieval tasks.
629
+ """
630
+
631
+ MODEL = "mxbai-embed-large"
632
+ DIMENSION = 1024
633
+
634
+ def __init__(self):
635
+ self._available: bool | None = None
636
+
637
+ @property
638
+ def dimension(self) -> int:
639
+ return self.DIMENSION
640
+
641
+ @property
642
+ def name(self) -> str:
643
+ return "mxbai"
644
+
645
+ async def check_available(self) -> bool:
646
+ if self._available is not None:
647
+ return self._available
648
+
649
+ try:
650
+ ollama = get_ollama()
651
+ models = ollama.list()
652
+ model_names = [m.model for m in models.models] if hasattr(models, "models") else []
653
+
654
+ if not any(name and self.MODEL in name for name in model_names):
655
+ print(
656
+ f"⚠️ Embedding model '{self.MODEL}' not found. Run: ollama pull {self.MODEL}",
657
+ file=sys.stderr,
658
+ )
659
+ self._available = False
660
+ return False
661
+
662
+ self._available = True
663
+ return True
664
+ except Exception as e:
665
+ print(f"⚠️ Ollama not available: {e}. Start with: ollama serve", file=sys.stderr)
666
+ self._available = False
667
+ return False
668
+
669
+ async def get_embedding(self, text: str) -> list[float]:
670
+ ollama = get_ollama()
671
+ # mxbai-embed-large has 512 token context. Code can be 1-2 chars/token.
672
+ # Truncate to 2000 chars (~1000-2000 tokens) for safety margin
673
+ truncated = text[:2000] if len(text) > 2000 else text
674
+ response = ollama.embeddings(model=self.MODEL, prompt=truncated)
675
+ return response["embedding"]
676
+
677
+
678
+ class HuggingFaceProvider(BaseEmbeddingProvider):
679
+ """Hugging Face Inference API embedding provider.
680
+
681
+ Uses the Hugging Face Inference API for embeddings. Requires HF_TOKEN from:
682
+ 1. Environment variable: HF_TOKEN or HUGGING_FACE_HUB_TOKEN
683
+ 2. HF CLI config: ~/.cache/huggingface/token or ~/.huggingface/token
684
+
685
+ Default model: sentence-transformers/all-mpnet-base-v2 (768 dims, high quality)
686
+ """
687
+
688
+ DEFAULT_MODEL = "sentence-transformers/all-mpnet-base-v2"
689
+ DEFAULT_DIMENSION = 768
690
+
691
+ def __init__(self, model: str | None = None):
692
+ self._available: bool | None = None
693
+ self._model = model or self.DEFAULT_MODEL
694
+ # Dimension varies by model, but we'll use default for common models
695
+ self._dimension = self.DEFAULT_DIMENSION
696
+ self._token: str | None = None
697
+
698
+ @property
699
+ def dimension(self) -> int:
700
+ return self._dimension
701
+
702
+ @property
703
+ def name(self) -> str:
704
+ return "huggingface"
705
+
706
+ def _get_hf_token(self) -> str | None:
707
+ """Discover HF token from environment or CLI config."""
708
+ import os
709
+
710
+ # Check environment variables first
711
+ token = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN")
712
+ if token:
713
+ return token
714
+
715
+ # Check HF CLI config locations
716
+ hf_token_paths = [
717
+ Path.home() / ".cache" / "huggingface" / "token",
718
+ Path.home() / ".huggingface" / "token",
719
+ ]
720
+
721
+ for token_path in hf_token_paths:
722
+ if token_path.exists():
723
+ try:
724
+ return token_path.read_text().strip()
725
+ except Exception:
726
+ continue
727
+
728
+ return None
729
+
730
+ async def check_available(self) -> bool:
731
+ if self._available is not None:
732
+ return self._available
733
+
734
+ try:
735
+ self._token = self._get_hf_token()
736
+ if not self._token:
737
+ print(
738
+ "⚠️ Hugging Face token not found. Run: huggingface-cli login or set HF_TOKEN env var",
739
+ file=sys.stderr,
740
+ )
741
+ self._available = False
742
+ return False
743
+
744
+ self._available = True
745
+ return True
746
+ except Exception as e:
747
+ print(f"⚠️ Hugging Face not available: {e}", file=sys.stderr)
748
+ self._available = False
749
+ return False
750
+
751
+ @retry(
752
+ stop=stop_after_attempt(3),
753
+ wait=wait_exponential(multiplier=1, min=2, max=10),
754
+ retry=retry_if_exception_type(httpx.HTTPStatusError),
755
+ )
756
+ async def get_embedding(self, text: str) -> list[float]:
757
+ """Get embedding from HF Inference API with retry logic."""
758
+ if not self._token:
759
+ self._token = self._get_hf_token()
760
+ if not self._token:
761
+ raise ValueError(
762
+ "Hugging Face token not found. Run: huggingface-cli login or set HF_TOKEN"
763
+ )
764
+
765
+ httpx_client = get_httpx()
766
+
767
+ # HF Serverless Inference API endpoint
768
+ # Note: Free tier may have limited availability for some models
769
+ api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{self._model}"
770
+
771
+ headers = {
772
+ "Authorization": f"Bearer {self._token}",
773
+ }
774
+
775
+ # Truncate text to reasonable length (most models have 512 token limit)
776
+ # ~2000 chars ≈ 500 tokens for safety
777
+ truncated = text[:2000] if len(text) > 2000 else text
778
+
779
+ # HF Inference API accepts raw JSON with inputs field
780
+ payload = {"inputs": [truncated], "options": {"wait_for_model": True}}
781
+
782
+ async with httpx_client.AsyncClient(timeout=60.0) as client:
783
+ response = await client.post(api_url, headers=headers, json=payload)
784
+
785
+ # Handle specific error codes
786
+ if response.status_code == 401:
787
+ raise ValueError(
788
+ "Hugging Face authentication failed. Run: huggingface-cli login or set HF_TOKEN"
789
+ )
790
+ elif response.status_code == 410:
791
+ # Model removed from free tier
792
+ raise ValueError(
793
+ f"Model {self._model} is no longer available on HF free Inference API (410 Gone). "
794
+ "Try a different model or use Ollama for local embeddings instead."
795
+ )
796
+ elif response.status_code == 503:
797
+ # Model loading - retry will handle this
798
+ logger.info(f"Model {self._model} is loading, retrying...")
799
+ response.raise_for_status()
800
+ elif response.status_code == 429:
801
+ # Rate limit - retry will handle with exponential backoff
802
+ logger.warning("HF API rate limit hit, retrying with backoff...")
803
+ response.raise_for_status()
804
+
805
+ response.raise_for_status()
806
+
807
+ # Response is a single embedding vector (list of floats)
808
+ embedding = response.json()
809
+
810
+ # Handle different response formats
811
+ if isinstance(embedding, list):
812
+ # Direct embedding or batch with single item
813
+ if isinstance(embedding[0], (int, float)):
814
+ return embedding
815
+ elif isinstance(embedding[0], list):
816
+ # Batch response with single embedding
817
+ return embedding[0]
818
+
819
+ raise ValueError(f"Unexpected response format from HF API: {type(embedding)}")
820
+
821
+ async def embed_batch(self, texts: list[str]) -> list[list[float]]:
822
+ """Batch embedding support for HF API.
823
+
824
+ HF API supports batch requests, so we can send multiple texts at once.
825
+ """
826
+ if not texts:
827
+ return []
828
+
829
+ if not self._token:
830
+ self._token = self._get_hf_token()
831
+ if not self._token:
832
+ raise ValueError(
833
+ "Hugging Face token not found. Run: huggingface-cli login or set HF_TOKEN"
834
+ )
835
+
836
+ httpx_client = get_httpx()
837
+
838
+ # HF Serverless Inference API endpoint
839
+ api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{self._model}"
840
+
841
+ headers = {
842
+ "Authorization": f"Bearer {self._token}",
843
+ }
844
+
845
+ # Truncate all texts
846
+ truncated_texts = [text[:2000] if len(text) > 2000 else text for text in texts]
847
+
848
+ payload = {"inputs": truncated_texts, "options": {"wait_for_model": True}}
849
+
850
+ async with httpx_client.AsyncClient(timeout=120.0) as client:
851
+ response = await client.post(api_url, headers=headers, json=payload)
852
+
853
+ if response.status_code == 401:
854
+ raise ValueError(
855
+ "Hugging Face authentication failed. Run: huggingface-cli login or set HF_TOKEN"
856
+ )
857
+
858
+ response.raise_for_status()
859
+
860
+ embeddings = response.json()
861
+
862
+ # Response should be a list of embeddings
863
+ if isinstance(embeddings, list) and all(isinstance(e, list) for e in embeddings):
864
+ return embeddings
865
+
866
+ raise ValueError(f"Unexpected batch response format from HF API: {type(embeddings)}")
867
+
868
+
869
+ # Embedding provider instance cache
870
+ _embedding_provider_cache: dict[str, BaseEmbeddingProvider] = {}
871
+ _embedding_provider_lock = threading.Lock()
872
+
873
+
874
+ def get_embedding_provider(provider: EmbeddingProvider) -> BaseEmbeddingProvider:
875
+ """Factory function to get an embedding provider instance with caching."""
876
+ if provider not in _embedding_provider_cache:
877
+ with _embedding_provider_lock:
878
+ # Double-check pattern to avoid race condition
879
+ if provider not in _embedding_provider_cache:
880
+ providers = {
881
+ "ollama": OllamaProvider,
882
+ "mxbai": MxbaiProvider,
883
+ "gemini": GeminiProvider,
884
+ "openai": OpenAIProvider,
885
+ "huggingface": HuggingFaceProvider,
886
+ }
887
+
888
+ if provider not in providers:
889
+ raise ValueError(
890
+ f"Unknown provider: {provider}. Available: {list(providers.keys())}"
891
+ )
892
+
893
+ _embedding_provider_cache[provider] = providers[provider]()
894
+
895
+ return _embedding_provider_cache[provider]
896
+
897
+
898
+ class CodebaseVectorStore:
899
+ """
900
+ Persistent vector store for a single codebase.
901
+
902
+ Storage: ~/.stravinsky/vectordb/<project_hash>_<provider>/
903
+ Embedding: Configurable via provider (ollama, gemini, openai)
904
+ """
905
+
906
+ CHUNK_SIZE = 50 # lines per chunk
907
+ CHUNK_OVERLAP = 10 # lines of overlap between chunks
908
+
909
+ # File patterns to index
910
+ CODE_EXTENSIONS = {
911
+ ".py",
912
+ ".js",
913
+ ".ts",
914
+ ".tsx",
915
+ ".jsx",
916
+ ".go",
917
+ ".rs",
918
+ ".rb",
919
+ ".java",
920
+ ".c",
921
+ ".cpp",
922
+ ".h",
923
+ ".hpp",
924
+ ".cs",
925
+ ".swift",
926
+ ".kt",
927
+ ".scala",
928
+ ".vue",
929
+ ".svelte",
930
+ ".md",
931
+ ".txt",
932
+ ".yaml",
933
+ ".yml",
934
+ ".json",
935
+ ".toml",
936
+ }
937
+
938
+ # Directories to skip (non-code related)
939
+ SKIP_DUW = {
940
+ # Python
941
+ "__pycache__",
942
+ ".venv",
943
+ "venv",
944
+ "env",
945
+ ".env",
946
+ "virtualenv",
947
+ ".virtualenv",
948
+ ".tox",
949
+ ".nox",
950
+ ".pytest_cache",
951
+ ".mypy_cache",
952
+ ".ruff_cache",
953
+ ".pytype",
954
+ ".pyre",
955
+ "*.egg-info",
956
+ ".eggs",
957
+ "pip-wheel-metadata",
958
+ # Node.js
959
+ "node_modules",
960
+ ".npm",
961
+ ".yarn",
962
+ ".pnpm-store",
963
+ "bower_components",
964
+ # Build outputs
965
+ "dist",
966
+ "build",
967
+ "out",
968
+ "_build",
969
+ ".next",
970
+ ".nuxt",
971
+ ".output",
972
+ ".cache",
973
+ ".parcel-cache",
974
+ ".turbo",
975
+ # Version control
976
+ ".git",
977
+ ".svn",
978
+ ".hg",
979
+ # IDE/Editor
980
+ ".idea",
981
+ ".vscode",
982
+ ".vs",
983
+ # Test/coverage
984
+ "coverage",
985
+ "htmlcov",
986
+ ".coverage",
987
+ ".nyc_output",
988
+ # Rust/Go/Java
989
+ "target",
990
+ "vendor",
991
+ "Godeps",
992
+ # Misc
993
+ ".stravinsky",
994
+ "scratches",
995
+ "consoles",
996
+ "logs",
997
+ "tmp",
998
+ "temp",
999
+ }
1000
+
1001
+ @staticmethod
1002
+ def _normalize_project_path(path: str) -> Path:
1003
+ """
1004
+ Normalize project path to git root if available.
1005
+
1006
+ This ensures one index per repo regardless of invocation directory.
1007
+ If not a git repo, returns resolved absolute path.
1008
+ """
1009
+ import subprocess
1010
+
1011
+ resolved = Path(path).resolve()
1012
+
1013
+ # Try to find git root
1014
+ try:
1015
+ result = subprocess.run(
1016
+ ["git", "-C", str(resolved), "rev-parse", "--show-toplevel"],
1017
+ capture_output=True,
1018
+ text=True,
1019
+ timeout=2,
1020
+ check=False,
1021
+ )
1022
+ if result.returncode == 0:
1023
+ git_root = Path(result.stdout.strip())
1024
+ logger.debug(f"Normalized {resolved} → {git_root} (git root)")
1025
+ return git_root
1026
+ except (subprocess.TimeoutExpired, FileNotFoundError):
1027
+ pass
1028
+
1029
+ # Not a git repo or git not available, use resolved path
1030
+ return resolved
1031
+
1032
+ def __init__(
1033
+ self,
1034
+ project_path: str,
1035
+ provider: EmbeddingProvider = "ollama",
1036
+ base_path: Path | None = None,
1037
+ ):
1038
+ self.project_path = self._normalize_project_path(project_path)
1039
+ self.repo_name = self.project_path.name
1040
+
1041
+ # Initialize embedding provider
1042
+ self.provider_name = provider
1043
+ self.provider = get_embedding_provider(provider)
1044
+
1045
+ # Store in provided base_path or user's home directory
1046
+ # Separate by provider to avoid dimension mismatch
1047
+ if base_path:
1048
+ self.db_path = base_path / f"{self.repo_name}_{provider}"
1049
+ else:
1050
+ self.db_path = Path.home() / ".stravinsky" / "vectordb" / f"{self.repo_name}_{provider}"
1051
+
1052
+ self.db_path.mkdir(parents=True, exist_ok=True)
1053
+
1054
+ # File lock for single-process access to ChromaDB (prevents corruption)
1055
+ self._lock_path = self.db_path / ".chromadb.lock"
1056
+ self._file_lock = None
1057
+
1058
+ self._client = None
1059
+ self._collection = None
1060
+
1061
+ # File watcher attributes
1062
+ self._watcher: CodebaseFileWatcher | None = None
1063
+ self._watcher_lock = threading.Lock()
1064
+
1065
+ # Cancellation flag for indexing operations
1066
+ self._cancel_indexing = False
1067
+ self._cancel_lock = threading.Lock()
1068
+
1069
+ @property
1070
+ def file_lock(self):
1071
+ """Get or create the file lock for this database.
1072
+
1073
+ Uses filelock to ensure single-process access to ChromaDB,
1074
+ preventing database corruption from concurrent writes.
1075
+ """
1076
+ if self._file_lock is None:
1077
+ filelock = get_filelock()
1078
+ # Timeout of 30 seconds - if lock can't be acquired, raise error
1079
+ self._file_lock = filelock.FileLock(str(self._lock_path), timeout=30)
1080
+ return self._file_lock
1081
+
1082
+ @property
1083
+ def client(self):
1084
+ if self._client is None:
1085
+ chromadb = get_chromadb()
1086
+
1087
+ # Check for stale lock before attempting acquisition
1088
+ # Prevents 30s timeout from dead processes causing MCP "Connection closed" errors
1089
+ if self._lock_path.exists():
1090
+ import time
1091
+
1092
+ lock_age = time.time() - self._lock_path.stat().st_mtime
1093
+ # Lock older than 60 seconds is likely from a crashed process
1094
+ # (Reduced from 300s to catch recently crashed processes)
1095
+ if lock_age > 60:
1096
+ logger.warning(
1097
+ f"Removing stale ChromaDB lock (age: {lock_age:.0f}s, path: {self._lock_path})"
1098
+ )
1099
+ try:
1100
+ self._lock_path.unlink(missing_ok=True)
1101
+ except Exception as e:
1102
+ logger.warning(f"Could not remove stale lock: {e}")
1103
+
1104
+ # Acquire lock before creating client to prevent concurrent access
1105
+ try:
1106
+ with self.file_lock: # Auto-releases on exit
1107
+ logger.debug(f"Acquired ChromaDB lock for {self.db_path}")
1108
+ self._client = chromadb.PersistentClient(path=str(self.db_path))
1109
+ except Exception as e:
1110
+ logger.warning(f"Could not acquire ChromaDB lock: {e}. Proceeding without lock.")
1111
+ self._client = chromadb.PersistentClient(path=str(self.db_path))
1112
+ return self._client
1113
+
1114
+ @property
1115
+ def collection(self):
1116
+ if self._collection is None:
1117
+ self._collection = self.client.get_or_create_collection(
1118
+ name="codebase", metadata={"hnsw:space": "cosine"}
1119
+ )
1120
+ return self._collection
1121
+
1122
+ async def check_embedding_service(self) -> bool:
1123
+ """Check if the embedding provider is available."""
1124
+ return await self.provider.check_available()
1125
+
1126
+ async def get_embedding(self, text: str) -> list[float]:
1127
+ """Get embedding vector for text using the configured provider."""
1128
+ return await self.provider.get_embedding(text)
1129
+
1130
+ async def get_embeddings_batch(
1131
+ self, texts: list[str], max_concurrent: int = 10
1132
+ ) -> list[list[float]]:
1133
+ """Get embeddings for multiple texts with parallel execution.
1134
+
1135
+ Uses asyncio.gather with semaphore-based concurrency control to avoid
1136
+ overwhelming the embedding service while maximizing throughput.
1137
+
1138
+ Args:
1139
+ texts: List of text strings to embed
1140
+ max_concurrent: Maximum concurrent embedding requests (default: 10)
1141
+
1142
+ Returns:
1143
+ List of embedding vectors in the same order as input texts.
1144
+ """
1145
+ import asyncio
1146
+
1147
+ if not texts:
1148
+ return []
1149
+
1150
+ # Use semaphore to limit concurrent requests
1151
+ semaphore = asyncio.Semaphore(max_concurrent)
1152
+
1153
+ async def get_with_semaphore(text: str, index: int) -> tuple[int, list[float]]:
1154
+ async with semaphore:
1155
+ emb = await self.get_embedding(text)
1156
+ return (index, emb)
1157
+
1158
+ # Launch all embedding requests concurrently (respecting semaphore)
1159
+ tasks = [get_with_semaphore(text, i) for i, text in enumerate(texts)]
1160
+ results = await asyncio.gather(*tasks)
1161
+
1162
+ # Sort by original index to maintain order
1163
+ sorted_results = sorted(results, key=lambda x: x[0])
1164
+ return [emb for _, emb in sorted_results]
1165
+
1166
+ async def _chunk_file(self, file_path: Path) -> list[dict]:
1167
+ """Split a file into chunks with metadata.
1168
+
1169
+ Uses AST-aware chunking for Python files to respect function/class
1170
+ boundaries. Falls back to line-based chunking for other languages.
1171
+ """
1172
+ try:
1173
+ content = file_path.read_text(encoding="utf-8", errors="ignore")
1174
+ except Exception:
1175
+ return []
1176
+
1177
+ lines = content.split("\n")
1178
+ if len(lines) < 5: # Skip very small files
1179
+ return []
1180
+
1181
+ rel_path = str(file_path.resolve().relative_to(self.project_path.resolve()))
1182
+ language = file_path.suffix.lstrip(".")
1183
+
1184
+ # Try native AST-aware chunking first
1185
+ native_results = await native_chunk_code(content, language)
1186
+ if native_results:
1187
+ chunks = []
1188
+ for nc in native_results:
1189
+ start_line = nc["start_line"]
1190
+ end_line = nc["end_line"]
1191
+ chunk_text = nc["content"]
1192
+ content_hash = hashlib.md5(chunk_text.encode("utf-8")).hexdigest()[:12]
1193
+
1194
+ node_type = nc.get("node_type", "unknown")
1195
+ name = nc.get("name")
1196
+
1197
+ if name:
1198
+ header = f"File: {rel_path}\n{node_type.capitalize()}: {name}\nLines: {start_line}-{end_line}"
1199
+ else:
1200
+ header = f"File: {rel_path}\nLines: {start_line}-{end_line}"
1201
+
1202
+ document = f"{header}\n\n{chunk_text}"
1203
+
1204
+ chunks.append({
1205
+ "id": f"{rel_path}:{start_line}-{end_line}:{content_hash}",
1206
+ "document": document,
1207
+ "metadata": {
1208
+ "file_path": rel_path,
1209
+ "start_line": start_line,
1210
+ "end_line": end_line,
1211
+ "language": language,
1212
+ "node_type": node_type,
1213
+ "name": name or "",
1214
+ }
1215
+ })
1216
+ if chunks:
1217
+ return chunks
1218
+
1219
+ # Use AST-aware chunking for Python files (fallback)
1220
+ if language == "py":
1221
+ chunks = self._chunk_python_ast(content, rel_path, language)
1222
+ if chunks: # If AST parsing succeeded
1223
+ return chunks
1224
+
1225
+ # Fallback: line-based chunking for other languages or if AST fails
1226
+ return self._chunk_by_lines(lines, rel_path, language)
1227
+
1228
+ def _chunk_python_ast(self, content: str, rel_path: str, language: str) -> list[dict]:
1229
+ """Parse Python file and create chunks based on function/class boundaries.
1230
+
1231
+ Each function, method, and class becomes its own chunk, preserving
1232
+ semantic boundaries for better embedding quality.
1233
+ """
1234
+ import ast
1235
+
1236
+ try:
1237
+ tree = ast.parse(content)
1238
+ except SyntaxError:
1239
+ return [] # Fall back to line-based chunking
1240
+
1241
+ lines = content.split("\n")
1242
+ chunks = []
1243
+
1244
+ def get_docstring(node: ast.AST) -> str:
1245
+ """Extract docstring from a node if present."""
1246
+ if (
1247
+ isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef))
1248
+ and node.body
1249
+ ):
1250
+ first = node.body[0]
1251
+ if isinstance(first, ast.Expr) and isinstance(first.value, ast.Constant):
1252
+ if isinstance(first.value.value, str):
1253
+ return first.value.value
1254
+ return ""
1255
+
1256
+ def get_decorators(
1257
+ node: ast.FunctionDef | ast.AsyncFunctionDef | ast.ClassDef,
1258
+ ) -> list[str]:
1259
+ """Extract decorator names from a node."""
1260
+ decorators = []
1261
+ for dec in node.decorator_list:
1262
+ if isinstance(dec, ast.Name):
1263
+ decorators.append(f"@{dec.id}")
1264
+ elif isinstance(dec, ast.Attribute):
1265
+ decorators.append(f"@{ast.unparse(dec)}")
1266
+ elif isinstance(dec, ast.Call):
1267
+ if isinstance(dec.func, ast.Name):
1268
+ decorators.append(f"@{dec.func.id}")
1269
+ elif isinstance(dec.func, ast.Attribute):
1270
+ decorators.append(f"@{ast.unparse(dec.func)}")
1271
+ return decorators
1272
+
1273
+ def get_base_classes(node: ast.ClassDef) -> list[str]:
1274
+ """Extract base class names from a class definition."""
1275
+ bases = []
1276
+ for base in node.bases:
1277
+ if isinstance(base, ast.Name):
1278
+ bases.append(base.id)
1279
+ elif isinstance(base, ast.Attribute):
1280
+ bases.append(ast.unparse(base))
1281
+ else:
1282
+ bases.append(ast.unparse(base))
1283
+ return bases
1284
+
1285
+ def get_return_type(node: ast.FunctionDef | ast.AsyncFunctionDef) -> str:
1286
+ """Extract return type annotation from a function."""
1287
+ if node.returns:
1288
+ return ast.unparse(node.returns)
1289
+ return ""
1290
+
1291
+ def get_parameters(node: ast.FunctionDef | ast.AsyncFunctionDef) -> list[str]:
1292
+ """Extract parameter signatures from a function."""
1293
+ params = []
1294
+ for arg in node.args.args:
1295
+ param = arg.arg
1296
+ if arg.annotation:
1297
+ param += f": {ast.unparse(arg.annotation)}"
1298
+ params.append(param)
1299
+ return params
1300
+
1301
+ def add_chunk(
1302
+ node: ast.FunctionDef | ast.AsyncFunctionDef | ast.ClassDef,
1303
+ node_type: str,
1304
+ name: str,
1305
+ parent_class: str | None = None,
1306
+ ) -> None:
1307
+ """Add a chunk for a function/class node."""
1308
+ start_line = node.lineno
1309
+ end_line = node.end_lineno or start_line
1310
+
1311
+ # Extract the source code for this node
1312
+ chunk_lines = lines[start_line - 1 : end_line]
1313
+ chunk_text = "\n".join(chunk_lines)
1314
+ content_hash = hashlib.md5(chunk_text.encode("utf-8")).hexdigest()[:12]
1315
+
1316
+ # Skip very small chunks
1317
+ if len(chunk_lines) < 3:
1318
+ return
1319
+
1320
+ # Build descriptive header
1321
+ docstring = get_docstring(node)
1322
+ if parent_class:
1323
+ header = f"File: {rel_path}\n{node_type}: {parent_class}.{name}\nLines: {start_line}-{end_line}"
1324
+ else:
1325
+ header = f"File: {rel_path}\n{node_type}: {name}\nLines: {start_line}-{end_line}"
1326
+
1327
+ if docstring:
1328
+ header += f"\nDocstring: {docstring[:200]}..."
1329
+
1330
+ document = f"{header}\n\n{chunk_text}"
1331
+
1332
+ chunks.append(
1333
+ {
1334
+ "id": f"{rel_path}:{start_line}-{end_line}:{content_hash}",
1335
+ "document": document,
1336
+ "metadata": {
1337
+ "file_path": rel_path,
1338
+ "start_line": start_line,
1339
+ "end_line": end_line,
1340
+ "language": language,
1341
+ "node_type": node_type.lower(),
1342
+ "name": f"{parent_class}.{name}" if parent_class else name,
1343
+ # Structural metadata for filtering
1344
+ "decorators": ",".join(get_decorators(node)),
1345
+ "is_async": isinstance(node, ast.AsyncFunctionDef),
1346
+ # Class-specific metadata
1347
+ "base_classes": ",".join(get_base_classes(node))
1348
+ if isinstance(node, ast.ClassDef)
1349
+ else "",
1350
+ # Function-specific metadata
1351
+ "return_type": get_return_type(node)
1352
+ if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
1353
+ else "",
1354
+ "parameters": ",".join(get_parameters(node))
1355
+ if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
1356
+ else "",
1357
+ },
1358
+ }
1359
+ )
1360
+
1361
+ # Walk the AST and extract functions/classes
1362
+ for node in ast.walk(tree):
1363
+ if isinstance(node, ast.ClassDef):
1364
+ add_chunk(node, "Class", node.name)
1365
+ # Also add methods as separate chunks for granular search
1366
+ for item in node.body:
1367
+ if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
1368
+ add_chunk(item, "Method", item.name, parent_class=node.name)
1369
+ elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
1370
+ # Only top-level functions (not methods)
1371
+ # Check if this function is inside a class body
1372
+ is_method = False
1373
+ for parent in ast.walk(tree):
1374
+ if isinstance(parent, ast.ClassDef):
1375
+ body = getattr(parent, "body", None)
1376
+ if isinstance(body, list) and node in body:
1377
+ is_method = True
1378
+ break
1379
+ if not is_method:
1380
+ add_chunk(node, "Function", node.name)
1381
+
1382
+ # If we found no functions/classes, chunk module-level code
1383
+ if not chunks and len(lines) >= 5:
1384
+ # Add module-level chunk for imports and constants
1385
+ module_chunk = "\n".join(lines[: min(50, len(lines))])
1386
+ chunks.append(
1387
+ {
1388
+ "id": f"{rel_path}:1-{min(50, len(lines))}",
1389
+ "document": f"File: {rel_path}\nModule-level code\nLines: 1-{min(50, len(lines))}\n\n{module_chunk}",
1390
+ "metadata": {
1391
+ "file_path": rel_path,
1392
+ "start_line": 1,
1393
+ "end_line": min(50, len(lines)),
1394
+ "language": language,
1395
+ "node_type": "module",
1396
+ "name": rel_path,
1397
+ },
1398
+ }
1399
+ )
1400
+
1401
+ return chunks
1402
+
1403
+ def _chunk_by_lines(self, lines: list[str], rel_path: str, language: str) -> list[dict]:
1404
+ """Fallback line-based chunking with overlap."""
1405
+ chunks = []
1406
+
1407
+ for i in range(0, len(lines), self.CHUNK_SIZE - self.CHUNK_OVERLAP):
1408
+ chunk_lines = lines[i : i + self.CHUNK_SIZE]
1409
+ if len(chunk_lines) < 5: # Skip tiny trailing chunks
1410
+ continue
1411
+
1412
+ chunk_text = "\n".join(chunk_lines)
1413
+ content_hash = hashlib.md5(chunk_text.encode("utf-8")).hexdigest()[:12]
1414
+ start_line = i + 1
1415
+ end_line = i + len(chunk_lines)
1416
+
1417
+ # Create a searchable document with context
1418
+ document = f"File: {rel_path}\nLines: {start_line}-{end_line}\n\n{chunk_text}"
1419
+
1420
+ chunks.append(
1421
+ {
1422
+ "id": f"{rel_path}:{start_line}-{end_line}:{content_hash}",
1423
+ "document": document,
1424
+ "metadata": {
1425
+ "file_path": rel_path,
1426
+ "start_line": start_line,
1427
+ "end_line": end_line,
1428
+ "language": language,
1429
+ },
1430
+ }
1431
+ )
1432
+
1433
+ return chunks
1434
+
1435
+ def _load_whitelist(self) -> set[Path] | None:
1436
+ """Load whitelist from .stravinskyadd file if present.
1437
+
1438
+ File format:
1439
+ - One path per line (relative to project root)
1440
+ - Lines starting with # are comments
1441
+ - Empty lines are ignored
1442
+ - Glob patterns are supported (e.g., src/**/*.py)
1443
+ - Directories implicitly include all files within (src/ includes src/**/*.*)
1444
+
1445
+ Returns:
1446
+ Set of resolved file paths to include, or None if no whitelist file exists.
1447
+ """
1448
+ whitelist_file = self.project_path / ".stravinskyadd"
1449
+ if not whitelist_file.exists():
1450
+ return None
1451
+
1452
+ whitelist_paths: set[Path] = set()
1453
+ try:
1454
+ content = whitelist_file.read_text(encoding="utf-8")
1455
+ for line in content.splitlines():
1456
+ line = line.strip()
1457
+ # Skip empty lines and comments
1458
+ if not line or line.startswith("#"):
1459
+ continue
1460
+
1461
+ # Handle glob patterns
1462
+ if "*" in line or "?" in line:
1463
+ for matched_path in self.project_path.glob(line):
1464
+ if (
1465
+ matched_path.is_file()
1466
+ and matched_path.suffix.lower() in self.CODE_EXTENSIONS
1467
+ ):
1468
+ whitelist_paths.add(matched_path.resolve())
1469
+ else:
1470
+ target = self.project_path / line
1471
+ if target.exists():
1472
+ if target.is_file():
1473
+ # Direct file reference
1474
+ if target.suffix.lower() in self.CODE_EXTENSIONS:
1475
+ whitelist_paths.add(target.resolve())
1476
+ elif target.is_dir():
1477
+ # Directory: include all code files recursively
1478
+ for file_path in target.rglob("*"):
1479
+ if (
1480
+ file_path.is_file()
1481
+ and file_path.suffix.lower() in self.CODE_EXTENSIONS
1482
+ ):
1483
+ # Apply SKIP_DUW even within whitelisted directories
1484
+ if not any(
1485
+ skip_dir in file_path.parts for skip_dir in self.SKIP_DUW
1486
+ ):
1487
+ whitelist_paths.add(file_path.resolve())
1488
+
1489
+ logger.info(f"Loaded whitelist from .stravinskyadd: {len(whitelist_paths)} files")
1490
+ return whitelist_paths
1491
+
1492
+ except Exception as e:
1493
+ logger.warning(f"Failed to parse .stravinskyadd: {e}")
1494
+ return None
1495
+
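To make the whitelist format concrete, here is a hypothetical `.stravinskyadd` written from Python; the paths are invented for illustration.

```python
from pathlib import Path

example = """\
# Index only the core package and its tests
mcp_bridge/
tests/**/*.py
scripts/build.py
"""
# The file lives at the project root, next to the code being indexed.
Path(".stravinskyadd").write_text(example, encoding="utf-8")
# Directory lines pull in every code file beneath them, glob lines are expanded
# with Path.glob(), and single-file lines are taken as-is; comments and blank
# lines are ignored, exactly as the docstring above describes.
```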
1496
+ def _get_files_to_index(self) -> list[Path]:
1497
+ """Get all indexable files in the project.
1498
+
1499
+ If a .stravinskyadd whitelist file exists, ONLY those paths are indexed.
1500
+ Otherwise, all code files are indexed (excluding SKIP_DUW).
1501
+ """
1502
+ # Check for whitelist mode
1503
+ whitelist = self._load_whitelist()
1504
+ if whitelist is not None:
1505
+ logger.info(f"Whitelist mode: indexing {len(whitelist)} files from .stravinskyadd")
1506
+ return sorted(whitelist) # Return sorted for deterministic order
1507
+
1508
+ # Standard mode: crawl entire project
1509
+ files = []
1510
+ for file_path in self.project_path.rglob("*"):
1511
+ if file_path.is_file():
1512
+ # Skip files outside project boundaries (symlink traversal protection)
1513
+ try:
1514
+ resolved_file = file_path.resolve()
1515
+ resolved_project = self.project_path.resolve()
1516
+
1517
+ # Check if file is under project using parent chain with samefile()
1518
+ # This handles macOS /var → /private/var aliasing and symlinks
1519
+ found = False
1520
+ current = resolved_file.parent
1521
+ while current != current.parent: # Stop at filesystem root
1522
+ try:
1523
+ if current.samefile(resolved_project):
1524
+ found = True
1525
+ break
1526
+ except OSError:
1527
+ # samefile can fail on some filesystems; try string comparison
1528
+ if current == resolved_project:
1529
+ found = True
1530
+ break
1531
+ current = current.parent
1532
+
1533
+ if not found:
1534
+ continue # Outside project
1535
+ except (ValueError, OSError):
1536
+ continue # Outside project boundaries
1537
+
1538
+ # Skip hidden files and directories
1539
+ if any(
1540
+ part.startswith(".") for part in file_path.parts[len(self.project_path.parts) :]
1541
+ ) and file_path.suffix not in {".md", ".txt"}: # Allow .github docs
1542
+ continue
1543
+
1544
+ # Skip excluded directories
1545
+ if any(skip_dir in file_path.parts for skip_dir in self.SKIP_DUW):
1546
+ continue
1547
+
1548
+ # Only include code files
1549
+ if file_path.suffix.lower() in self.CODE_EXTENSIONS:
1550
+ files.append(file_path)
1551
+
1552
+ return files
1553
+
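The symlink-protection block above can be read as a single containment test: walk the resolved parent chain of each file and compare every ancestor to the project root. A standalone sketch of that test; the helper name is illustrative.

```python
from pathlib import Path

def is_within_project(file_path: Path, project_root: Path) -> bool:
    """Walk the resolved parent chain and compare each ancestor to the root.

    samefile() tolerates aliases such as macOS /var -> /private/var; plain
    equality is the fallback when samefile() is not supported.
    """
    resolved_root = project_root.resolve()
    current = file_path.resolve().parent
    while current != current.parent:  # stop at the filesystem root
        try:
            if current.samefile(resolved_root):
                return True
        except OSError:
            if current == resolved_root:
                return True
        current = current.parent
    return False
```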
1554
+ def request_cancel_indexing(self) -> None:
1555
+ """Request cancellation of ongoing indexing operation.
1556
+
1557
+ Sets a flag that will be checked between batches. The operation will
1558
+ stop gracefully after completing the current batch.
1559
+ """
1560
+ with self._cancel_lock:
1561
+ self._cancel_indexing = True
1562
+ logger.info(f"Cancellation requested for {self.project_path}")
1563
+
1564
+ def clear_cancel_flag(self) -> None:
1565
+ """Clear the cancellation flag."""
1566
+ with self._cancel_lock:
1567
+ self._cancel_indexing = False
1568
+
1569
+ def is_cancellation_requested(self) -> bool:
1570
+ """Check if cancellation has been requested."""
1571
+ with self._cancel_lock:
1572
+ return self._cancel_indexing
1573
+
1574
+ def _get_manifest_path(self) -> Path:
1575
+ """Get the path to the incremental indexing manifest."""
1576
+ return self.db_path / "manifest.json"
1577
+
1578
+ def _load_manifest(self) -> dict:
1579
+ """Load the indexing manifest."""
1580
+ manifest_path = self._get_manifest_path()
1581
+ if not manifest_path.exists():
1582
+ return {}
1583
+ try:
1584
+ import json
1585
+
1586
+ with open(manifest_path, "r", encoding="utf-8") as f:
1587
+ return json.load(f)
1588
+ except Exception as e:
1589
+ logger.warning(f"Failed to load manifest: {e}")
1590
+ return {}
1591
+
1592
+ def _save_manifest(self, manifest: dict) -> None:
1593
+ """Save the indexing manifest."""
1594
+ manifest_path = self._get_manifest_path()
1595
+ try:
1596
+ import json
1597
+
1598
+ # Atomic write
1599
+ temp_path = manifest_path.with_suffix(".tmp")
1600
+ with open(temp_path, "w", encoding="utf-8") as f:
1601
+ json.dump(manifest, f, indent=2)
1602
+ temp_path.replace(manifest_path)
1603
+ except Exception as e:
1604
+ logger.warning(f"Failed to save manifest: {e}")
1605
+
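The manifest these two helpers round-trip is a plain JSON object keyed by absolute file path; its per-file shape follows from how `index_codebase()` below populates `new_manifest`. An illustrative example, with invented paths, timestamps, and chunk ids:

```python
# Shape of the manifest.json written next to the vector store.
manifest = {
    "/home/user/project/mcp_bridge/auth/oauth.py": {
        "mtime": 1714501122.53,
        "size": 8342,
        "chunk_ids": [
            "mcp_bridge/auth/oauth.py:10-42",
            "mcp_bridge/auth/oauth.py:45-98",
        ],
    },
}
# A file is skipped on the next run only if its mtime and size still match
# AND every listed chunk id is still present in the collection.
```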
1606
+ async def index_codebase(self, force: bool = False) -> dict:
1607
+ """
1608
+ Index the entire codebase into the vector store.
1609
+
1610
+ This operation can be cancelled by calling request_cancel_indexing().
1611
+ Cancellation happens between batches, so the current batch will complete.
1612
+
1613
+ Args:
1614
+ force: If True, reindex everything. Otherwise, only index new/changed files.
1615
+
1616
+ Returns:
1617
+ Statistics about the indexing operation.
1618
+ """
1619
+ import time
1620
+
1621
+ # Clear any previous cancellation requests
1622
+ self.clear_cancel_flag()
1623
+
1624
+ # Start timing
1625
+ start_time = time.time()
1626
+
1627
+ print(f"🔍 SEMANTIC-INDEX: {self.project_path}", file=sys.stderr)
1628
+
1629
+ # Notify reindex start (non-blocking)
1630
+ notifier = None
1631
+ try:
1632
+ from mcp_bridge.notifications import get_notification_manager
1633
+
1634
+ notifier = get_notification_manager()
1635
+ await notifier.notify_reindex_start(str(self.project_path))
1636
+ except Exception as e:
1637
+ logger.warning(f"Failed to send reindex start notification: {e}")
1638
+
1639
+ try:
1640
+ if not await self.check_embedding_service():
1641
+ error_msg = "Embedding service not available"
1642
+ try:
1643
+ if notifier:
1644
+ await notifier.notify_reindex_error(error_msg)
1645
+ except Exception as e:
1646
+ logger.warning(f"Failed to send reindex error notification: {e}")
1647
+ return {"error": error_msg, "indexed": 0}
1648
+
1649
+ # Get existing document IDs
1650
+ existing_ids = set()
1651
+ try:
1652
+ # Only fetch IDs to minimize overhead
1653
+ existing = self.collection.get(include=[])
1654
+ existing_ids = set(existing["ids"]) if existing["ids"] else set()
1655
+ except Exception:
1656
+ pass
1657
+
1658
+ manifest = {}
1659
+ if force:
1660
+ # Clear existing collection and manifest
1661
+ try:
1662
+ self.client.delete_collection("codebase")
1663
+ self._collection = None
1664
+ existing_ids = set()
1665
+ except Exception:
1666
+ pass
1667
+ else:
1668
+ manifest = self._load_manifest()
1669
+
1670
+ files = self._get_files_to_index()
1671
+ all_chunks = []
1672
+ current_chunk_ids = set()
1673
+
1674
+ # Track manifest updates
1675
+ new_manifest = {}
1676
+
1677
+ # Stats
1678
+ reused_files = 0
1679
+
1680
+ # Mark: Generate all chunks for current codebase
1681
+ for file_path in files:
1682
+ str_path = str(file_path.resolve())
1683
+
1684
+ # Get file stats
1685
+ try:
1686
+ stat = file_path.stat()
1687
+ mtime = stat.st_mtime
1688
+ size = stat.st_size
1689
+ except OSError:
1690
+ continue # File might have been deleted during iteration
1691
+
1692
+ # Check manifest
1693
+ manifest_entry = manifest.get(str_path)
1694
+
1695
+ # Reuse chunks if file hasn't changed AND chunks exist in DB
1696
+ if (
1697
+ not force
1698
+ and manifest_entry
1699
+ and manifest_entry.get("mtime") == mtime
1700
+ and manifest_entry.get("size") == size
1701
+ ):
1702
+ chunk_ids = manifest_entry.get("chunk_ids", [])
1703
+
1704
+ # Verify all chunks actually exist in DB (integrity check)
1705
+ if chunk_ids and all(cid in existing_ids for cid in chunk_ids):
1706
+ current_chunk_ids.update(chunk_ids)
1707
+ new_manifest[str_path] = manifest_entry
1708
+ reused_files += 1
1709
+ continue
1710
+
1711
+ # If we get here: file changed, new, or chunks missing from DB
1712
+ chunks = await self._chunk_file(file_path)
1713
+ all_chunks.extend(chunks)
1714
+
1715
+ new_chunk_ids = []
1716
+ for c in chunks:
1717
+ cid = c["id"]
1718
+ current_chunk_ids.add(cid)
1719
+ new_chunk_ids.append(cid)
1720
+
1721
+ # Update manifest
1722
+ new_manifest[str_path] = {"mtime": mtime, "size": size, "chunk_ids": new_chunk_ids}
1723
+
1724
+ # Save updated manifest
1725
+ self._save_manifest(new_manifest)
1726
+
1727
+ # Sweep: Identify stale chunks to remove
1728
+ to_delete = existing_ids - current_chunk_ids
1729
+
1730
+ # Identify new chunks to add
1731
+ to_add_ids = current_chunk_ids - existing_ids
1732
+ chunks_to_add = [c for c in all_chunks if c["id"] in to_add_ids]
1733
+
1734
+ # Prune stale chunks
1735
+ if to_delete:
1736
+ print(f" Pruning {len(to_delete)} stale chunks...", file=sys.stderr)
1737
+ self.collection.delete(ids=list(to_delete))
1738
+
1739
+ if not chunks_to_add:
1740
+ stats = {
1741
+ "indexed": 0,
1742
+ "pruned": len(to_delete),
1743
+ "total_files": len(files),
1744
+ "reused_files": reused_files,
1745
+ "message": f"No new chunks to index (reused {reused_files} files)",
1746
+ "time_taken": round(time.time() - start_time, 1),
1747
+ }
1748
+ try:
1749
+ if notifier:
1750
+ await notifier.notify_reindex_complete(stats)
1751
+ except Exception as e:
1752
+ logger.warning(f"Failed to send reindex complete notification: {e}")
1753
+ return stats
1754
+
1755
+ # Batch embed and store
1756
+ batch_size = 50
1757
+ total_indexed = 0
1758
+
1759
+ for i in range(0, len(chunks_to_add), batch_size):
1760
+ # Check for cancellation between batches
1761
+ if self.is_cancellation_requested():
1762
+ print(f" ⚠️ Indexing cancelled after {total_indexed} chunks", file=sys.stderr)
1763
+ stats = {
1764
+ "indexed": total_indexed,
1765
+ "pruned": len(to_delete),
1766
+ "total_files": len(files),
1767
+ "db_path": str(self.db_path),
1768
+ "time_taken": round(time.time() - start_time, 1),
1769
+ "cancelled": True,
1770
+ "message": f"Cancelled after {total_indexed}/{len(chunks_to_add)} chunks",
1771
+ }
1772
+ try:
1773
+ if notifier:
1774
+ await notifier.notify_reindex_error(
1775
+ f"Indexing cancelled by user after {total_indexed} chunks"
1776
+ )
1777
+ except Exception as e:
1778
+ logger.warning(f"Failed to send cancellation notification: {e}")
1779
+ return stats
1780
+
1781
+ batch = chunks_to_add[i : i + batch_size]
1782
+
1783
+ documents = [c["document"] for c in batch]
1784
+ embeddings = await self.get_embeddings_batch(documents)
1785
+
1786
+ self.collection.add(
1787
+ ids=[c["id"] for c in batch],
1788
+ documents=documents,
1789
+ embeddings=embeddings, # type: ignore[arg-type]
1790
+ metadatas=[c["metadata"] for c in batch],
1791
+ )
1792
+ total_indexed += len(batch)
1793
+ print(f" Indexed {total_indexed}/{len(chunks_to_add)} chunks...", file=sys.stderr)
1794
+
1795
+ stats = {
1796
+ "indexed": total_indexed,
1797
+ "pruned": len(to_delete),
1798
+ "total_files": len(files),
1799
+ "reused_files": reused_files,
1800
+ "db_path": str(self.db_path),
1801
+ "time_taken": round(time.time() - start_time, 1),
1802
+ }
1803
+
1804
+ try:
1805
+ if notifier:
1806
+ await notifier.notify_reindex_complete(stats)
1807
+ except Exception as e:
1808
+ logger.warning(f"Failed to send reindex complete notification: {e}")
1809
+
1810
+ return stats
1811
+
1812
+ except Exception as e:
1813
+ error_msg = str(e)
1814
+ logger.error(f"Reindexing failed: {error_msg}")
1815
+ try:
1816
+ if notifier:
1817
+ await notifier.notify_reindex_error(error_msg)
1818
+ except Exception as notify_error:
1819
+ logger.warning(f"Failed to send reindex error notification: {notify_error}")
1820
+ raise
1821
+
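The incremental bookkeeping in `index_codebase()` boils down to set arithmetic between the ids already stored in the collection and the ids produced by the current run. A toy walk-through:

```python
# Toy illustration of the mark-and-sweep bookkeeping used above (ids invented).
existing_ids = {"a.py:1-40", "a.py:42-90", "b.py:1-30"}       # already in the collection
current_chunk_ids = {"a.py:1-40", "a.py:42-95", "c.py:1-20"}  # produced by this run

to_delete = existing_ids - current_chunk_ids   # stale chunks to prune
to_add = current_chunk_ids - existing_ids      # new chunks to embed and store
unchanged = existing_ids & current_chunk_ids   # kept as-is, no re-embedding

assert to_delete == {"a.py:42-90", "b.py:1-30"}
assert to_add == {"a.py:42-95", "c.py:1-20"}
assert unchanged == {"a.py:1-40"}
```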
1822
+ async def search(
1823
+ self,
1824
+ query: str,
1825
+ n_results: int = 10,
1826
+ language: str | None = None,
1827
+ node_type: str | None = None,
1828
+ decorator: str | None = None,
1829
+ is_async: bool | None = None,
1830
+ base_class: str | None = None,
1831
+ ) -> list[dict]:
1832
+ """
1833
+ Search the codebase with a natural language query.
1834
+
1835
+ Args:
1836
+ query: Natural language search query
1837
+ n_results: Maximum number of results to return
1838
+ language: Filter by language (e.g., "py", "ts", "js")
1839
+ node_type: Filter by node type (e.g., "function", "class", "method")
1840
+ decorator: Filter by decorator (e.g., "@property", "@staticmethod")
1841
+ is_async: Filter by async status (True = async only, False = sync only)
1842
+ base_class: Filter by base class (e.g., "BaseClass")
1843
+
1844
+ Returns:
1845
+ List of matching code chunks with metadata.
1846
+ """
1847
+ filters = []
1848
+ if language:
1849
+ filters.append(f"language={language}")
1850
+ if node_type:
1851
+ filters.append(f"node_type={node_type}")
1852
+ if decorator:
1853
+ filters.append(f"decorator={decorator}")
1854
+ if is_async is not None:
1855
+ filters.append(f"is_async={is_async}")
1856
+ if base_class:
1857
+ filters.append(f"base_class={base_class}")
1858
+ filter_str = f" [{', '.join(filters)}]" if filters else ""
1859
+ print(f"🔎 SEMANTIC-SEARCH: '{query[:50]}...'{filter_str}", file=sys.stderr)
1860
+
1861
+ if not await self.check_embedding_service():
1862
+ return [{"error": "Embedding service not available"}]
1863
+
1864
+ # Check if collection has documents
1865
+ try:
1866
+ count = self.collection.count()
1867
+ if count == 0:
1868
+ return [{"error": "No documents indexed", "hint": "Run index_codebase first"}]
1869
+ except Exception as e:
1870
+ return [{"error": f"Collection error: {e}"}]
1871
+
1872
+ # Get query embedding
1873
+ query_embedding = await self.get_embedding(query)
1874
+
1875
+ # Build where clause for metadata filtering
1876
+ where_filters = []
1877
+ if language:
1878
+ where_filters.append({"language": language})
1879
+ if node_type:
1880
+ where_filters.append({"node_type": node_type.lower()})
1881
+ if decorator:
1882
+ # ChromaDB $like for substring match in comma-separated field
1883
+ # Use % wildcards for pattern matching
1884
+ where_filters.append({"decorators": {"$like": f"%{decorator}%"}})
1885
+ if is_async is not None:
1886
+ where_filters.append({"is_async": is_async})
1887
+ if base_class:
1888
+ # Use $like for substring match
1889
+ where_filters.append({"base_classes": {"$like": f"%{base_class}%"}})
1890
+
1891
+ where_clause = None
1892
+ if len(where_filters) == 1:
1893
+ where_clause = where_filters[0]
1894
+ elif len(where_filters) > 1:
1895
+ where_clause = {"$and": where_filters}
1896
+
1897
+ # Search with optional filtering
1898
+ query_kwargs: dict = {
1899
+ "query_embeddings": [query_embedding],
1900
+ "n_results": n_results,
1901
+ "include": ["documents", "metadatas", "distances"],
1902
+ }
1903
+ if where_clause:
1904
+ query_kwargs["where"] = where_clause
1905
+
1906
+ results = self.collection.query(**query_kwargs)
1907
+
1908
+ # Format results
1909
+ formatted = []
1910
+ if results["ids"] and results["ids"][0]:
1911
+ for i, _doc_id in enumerate(results["ids"][0]):
1912
+ metadata = results["metadatas"][0][i] if results["metadatas"] else {}
1913
+ distance = results["distances"][0][i] if results["distances"] else 0
1914
+ document = results["documents"][0][i] if results["documents"] else ""
1915
+
1916
+ # Extract just the code part (skip file/line header)
1917
+ code_lines = document.split("\n\n", 1)
1918
+ code = code_lines[1] if len(code_lines) > 1 else document
1919
+
1920
+ formatted.append(
1921
+ {
1922
+ "file": metadata.get("file_path", "unknown"),
1923
+ "lines": f"{metadata.get('start_line', '?')}-{metadata.get('end_line', '?')}",
1924
+ "language": metadata.get("language", ""),
1925
+ "relevance": round(1 - distance, 3), # Convert distance to similarity
1926
+ "code_preview": code[:500] + "..." if len(code) > 500 else code,
1927
+ }
1928
+ )
1929
+
1930
+ return formatted
1931
+
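The filter assembly inside `search()` can be isolated into a small helper: a single filter is passed through as-is, multiple filters are wrapped in `$and`, and the comma-separated metadata fields use the `$like` substring form the code above already relies on. A sketch, with an illustrative helper name:

```python
def build_where_clause(
    language: str | None = None,
    node_type: str | None = None,
    decorator: str | None = None,
) -> dict | None:
    """Assemble the metadata filter the same way search() does above."""
    filters: list[dict] = []
    if language:
        filters.append({"language": language})
    if node_type:
        filters.append({"node_type": node_type.lower()})
    if decorator:
        # substring match against the comma-separated decorators field
        filters.append({"decorators": {"$like": f"%{decorator}%"}})

    if not filters:
        return None
    return filters[0] if len(filters) == 1 else {"$and": filters}

# build_where_clause(language="py", decorator="@property")
# -> {"$and": [{"language": "py"}, {"decorators": {"$like": "%@property%"}}]}
```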
1932
+ def get_stats(self) -> dict:
1933
+ """Get statistics about the vector store."""
1934
+ try:
1935
+ count = self.collection.count()
1936
+ return {
1937
+ "project_path": str(self.project_path),
1938
+ "db_path": str(self.db_path),
1939
+ "chunks_indexed": count,
1940
+ "embedding_provider": self.provider.name,
1941
+ "embedding_dimension": self.provider.dimension,
1942
+ }
1943
+ except Exception as e:
1944
+ return {"error": str(e)}
1945
+
1946
+ def start_watching(self, debounce_seconds: float = 2.0) -> "CodebaseFileWatcher":
1947
+ """Start watching the project directory for file changes.
1948
+
1949
+ Args:
1950
+ debounce_seconds: Time to wait before reindexing after changes (default: 2.0s)
1951
+
1952
+ Returns:
1953
+ The CodebaseFileWatcher instance
1954
+ """
1955
+ with self._watcher_lock:
1956
+ if self._watcher is None:
1957
+ # Avoid circular import by importing here
+ # CodebaseVectorStore only references CodebaseFileWatcher lazily here; the class is defined later in this module (hence the quoted type hints)
1958
+ self._watcher = CodebaseFileWatcher(
1959
+ project_path=self.project_path,
1960
+ store=self,
1961
+ debounce_seconds=debounce_seconds,
1962
+ )
1963
+ self._watcher.start()
1964
+ else:
1965
+ if not self._watcher.is_running():
1966
+ self._watcher.start()
1967
+ else:
1968
+ logger.warning(f"Watcher for {self.project_path} is already running")
1969
+ return self._watcher
1970
+
1971
+ def stop_watching(self) -> bool:
1972
+ """Stop watching the project directory.
1973
+
1974
+ Returns:
1975
+ True if watcher was stopped, False if no watcher was active
1976
+ """
1977
+ with self._watcher_lock:
1978
+ if self._watcher is not None:
1979
+ self._watcher.stop()
1980
+ self._watcher = None
1981
+ return True
1982
+ return False
1983
+
1984
+ def is_watching(self) -> bool:
1985
+ """Check if the project directory is being watched.
1986
+
1987
+ Returns:
1988
+ True if watcher is active and running, False otherwise
1989
+ """
1990
+ with self._watcher_lock:
1991
+ if self._watcher is not None:
1992
+ return self._watcher.is_running()
1993
+ return False
1994
+
1995
+
1996
+ # --- Module-level API for MCP tools ---
1997
+
1998
+ _stores: dict[str, CodebaseVectorStore] = {}
1999
+ _stores_lock = threading.Lock()
2000
+
2001
+ # Module-level watcher management
2002
+ _watchers: dict[str, "CodebaseFileWatcher"] = {}
2003
+ _watchers_lock = threading.Lock()
2004
+
2005
+
2006
+ def _cleanup_watchers():
2007
+ """Cleanup function to stop all watchers on exit.
2008
+
2009
+ Registered with atexit to ensure graceful shutdown when Python exits normally.
2010
+ Note: This won't be called if the process is killed (SIGKILL) or crashes.
2011
+ """
2012
+ with _watchers_lock:
2013
+ for path, watcher in list(_watchers.items()):
2014
+ try:
2015
+ logger.debug(f"Stopping watcher for {path} on exit")
2016
+ watcher.stop()
2017
+ except Exception as e:
2018
+ logger.warning(f"Error stopping watcher for {path}: {e}")
2019
+
2020
+
2021
+ # Register cleanup handler for graceful shutdown
2022
+ atexit.register(_cleanup_watchers)
2023
+
2024
+
2025
+ def _check_index_exists(store: "CodebaseVectorStore") -> bool:
2026
+ """Check if semantic index exists for this project."""
2027
+ try:
2028
+ doc_count = store.collection.count()
2029
+ return doc_count > 0
2030
+ except Exception as e:
2031
+ logger.warning(f"Could not check index status: {e}")
2032
+ return False
2033
+
2034
+
2035
+ def _prompt_with_timeout(prompt_text: str, timeout: int = 30) -> str:
2036
+ """
2037
+ Prompt user with timeout. Returns 'n' if timeout or non-interactive.
2038
+
2039
+ Args:
2040
+ prompt_text: The prompt to display
2041
+ timeout: Timeout in seconds (default: 30)
2042
+
2043
+ Returns:
2044
+ User response or 'n' if timeout/non-interactive
2045
+ """
2046
+ # Check if stdin is interactive
2047
+ if not sys.stdin.isatty():
2048
+ return "n" # Non-interactive, skip prompt
2049
+
2050
+ # Windows doesn't support SIGALRM, so we need a different approach
2051
+ if sys.platform == "win32":
2052
+ try:
2053
+ import msvcrt
2054
+ import time
2055
+
2056
+ print(prompt_text, end="", flush=True, file=sys.stderr)
2057
+ start_time = time.time()
2058
+ response = []
2059
+
2060
+ while time.time() - start_time < timeout:
2061
+ if msvcrt.kbhit():
2062
+ char = msvcrt.getwche()
2063
+ if char in ("\r", "\n"):
2064
+ print(file=sys.stderr) # Newline after input
2065
+ return "".join(response)
2066
+ response.append(char)
2067
+ time.sleep(0.1)
2068
+
2069
+ print("\n⏱️ Timeout - skipping index creation", file=sys.stderr)
2070
+ return "n"
2071
+ except Exception:  # Exception already covers ImportError, so one clause suffices
2072
+ # Fallback: just use input() without timeout on Windows
2073
+ try:
2074
+ return input(prompt_text)
2075
+ except EOFError:
2076
+ return "n"
2077
+
2078
+ # Unix-like systems (Linux, macOS)
2079
+ def timeout_handler(signum, frame):
2080
+ raise TimeoutError()
2081
+
2082
+ try:
2083
+ # Save old handler
2084
+ old_handler = signal.signal(signal.SIGALRM, timeout_handler)
2085
+ signal.alarm(timeout)
2086
+ response = input(prompt_text)
2087
+ signal.alarm(0) # Cancel alarm
2088
+ # Restore old handler
2089
+ signal.signal(signal.SIGALRM, old_handler)
2090
+ return response
2091
+ except (TimeoutError, EOFError):
2092
+ signal.alarm(0) # Cancel alarm
2093
+ # Restore old handler
2094
+ try:
2095
+ signal.signal(signal.SIGALRM, old_handler)
2096
+ except Exception:
2097
+ pass
2098
+ print("\n⏱️ Timeout - skipping index creation", file=sys.stderr)
2099
+ return "n"
2100
+ except Exception as e:
2101
+ signal.alarm(0) # Cancel alarm
2102
+ logger.warning(f"Error during prompt: {e}")
2103
+ return "n"
2104
+
2105
+
2106
+ def get_store(project_path: str, provider: EmbeddingProvider = "ollama") -> CodebaseVectorStore:
2107
+ """Get or create a vector store for a project.
2108
+
2109
+ Note: Cache key includes provider to prevent cross-provider conflicts
2110
+ (different providers have different embedding dimensions).
2111
+ """
2112
+ path = str(Path(project_path).resolve())
2113
+ cache_key = f"{path}:{provider}"
2114
+ if cache_key not in _stores:
2115
+ with _stores_lock:
2116
+ # Double-check pattern to avoid race condition
2117
+ if cache_key not in _stores:
2118
+ _stores[cache_key] = CodebaseVectorStore(path, provider)
2119
+ return _stores[cache_key]
2120
+
2121
+
2122
+ async def semantic_search(
2123
+ query: str,
2124
+ project_path: str = ".",
2125
+ n_results: int = 10,
2126
+ language: str | None = None,
2127
+ node_type: str | None = None,
2128
+ decorator: str | None = None,
2129
+ is_async: bool | None = None,
2130
+ base_class: str | None = None,
2131
+ provider: EmbeddingProvider = "ollama",
2132
+ ) -> str:
2133
+ """
2134
+ Search codebase with natural language query.
2135
+
2136
+ Args:
2137
+ query: Natural language search query (e.g., "find authentication logic")
2138
+ project_path: Path to the project root
2139
+ n_results: Maximum number of results to return
2140
+ language: Filter by language (e.g., "py", "ts", "js")
2141
+ node_type: Filter by node type (e.g., "function", "class", "method")
2142
+ decorator: Filter by decorator (e.g., "@property", "@staticmethod")
2143
+ is_async: Filter by async status (True = async only, False = sync only)
2144
+ base_class: Filter by base class (e.g., "BaseClass")
2145
+ provider: Embedding provider (ollama, mxbai, gemini, openai, huggingface)
2146
+
2147
+ Returns:
2148
+ Formatted search results with file paths and code snippets.
2149
+ """
2150
+ store = get_store(project_path, provider)
2151
+
2152
+ # Check if index exists before searching
2153
+ if not _check_index_exists(store):
2154
+ print("\n⚠️ No semantic index found for this project.", file=sys.stderr)
2155
+ print(f"📁 Project: {project_path}", file=sys.stderr)
2156
+ print(f"🔍 Provider: {provider}", file=sys.stderr)
2157
+
2158
+ # Interactive prompt with timeout
2159
+ response = _prompt_with_timeout("\n🤔 Create semantic index now? [Y/n] (30s timeout): ")
2160
+
2161
+ if response.lower() in ["", "y", "yes"]:
2162
+ print("\n📋 Creating semantic index...", file=sys.stderr)
2163
+ try:
2164
+ # Call index_codebase function
2165
+ index_result = await index_codebase(project_path, provider=provider, force=False)
2166
+ print(f"✅ {index_result}", file=sys.stderr)
2167
+
2168
+ # Auto-start file watcher
2169
+ print("🔄 Starting file watcher for auto-updates...", file=sys.stderr)
2170
+ await start_file_watcher(project_path, provider)
2171
+ print("✅ File watcher started - index will auto-update on changes", file=sys.stderr)
2172
+
2173
+ except Exception as e:
2174
+ logger.error(f"Failed to create index: {e}")
2175
+ return (
2176
+ f"❌ Failed to create index: {e}\n\n"
2177
+ "**Manual fix:**\n"
2178
+ "```python\n"
2179
+ f'index_codebase(project_path="{project_path}", provider="{provider}")\n'
2180
+ "```"
2181
+ )
2182
+ else:
2183
+ return (
2184
+ "❌ Index required for semantic search.\n\n"
2185
+ "**To create index manually:**\n"
2186
+ "```python\n"
2187
+ f'index_codebase(project_path="{project_path}", provider="{provider}")\n'
2188
+ "```\n\n"
2189
+ "This indexes your codebase for natural language search. "
2190
+ "Run it once per project (takes 30s-2min depending on size)."
2191
+ )
2192
+ else:
2193
+ # Index exists, ensure watcher is running
2194
+ # start_file_watcher() is awaited here so the call order stays deterministic (and testable);
+ # any failure is caught below and logged without blocking the search.
2198
+ try:
2199
+ await start_file_watcher(project_path, provider)
2200
+ except Exception as e:
2201
+ logger.warning(f"Failed to auto-start watcher: {e}")
2202
+
2203
+ results = await store.search(
2204
+ query,
2205
+ n_results,
2206
+ language,
2207
+ node_type,
2208
+ decorator=decorator,
2209
+ is_async=is_async,
2210
+ base_class=base_class,
2211
+ )
2212
+
2213
+ if not results:
2214
+ return "No results found"
2215
+
2216
+ if "error" in results[0]:
2217
+ return f"Error: {results[0]['error']}\nHint: {results[0].get('hint', 'Check Ollama is running')}"
2218
+
2219
+ # Auto-start file watcher if not already running (index exists and search succeeded)
2220
+ try:
2221
+ active_watcher = get_file_watcher(project_path)
2222
+ if active_watcher is None:
2223
+ # Index exists but no watcher - start it silently in background
2224
+ logger.info(f"Auto-starting file watcher for {project_path}")
2225
+ await start_file_watcher(project_path, provider, debounce_seconds=2.0)
2226
+ except Exception as e:
2227
+ # Don't fail the search if watcher fails to start
2228
+ logger.warning(f"Could not auto-start file watcher: {e}")
2229
+
2230
+ lines = [f"Found {len(results)} results for: '{query}'\n"]
2231
+ for i, r in enumerate(results, 1):
2232
+ lines.append(f"{i}. {r['file']}:{r['lines']} (relevance: {r['relevance']})")
2233
+ lines.append(f"```{r['language']}")
2234
+ lines.append(r["code_preview"])
2235
+ lines.append("```\n")
2236
+
2237
+ return "\n".join(lines)
2238
+
2239
+
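A minimal usage sketch for `semantic_search()` as defined above, assuming the function is in scope; the query, path, and provider values are placeholders.

```python
import asyncio

report = asyncio.run(
    semantic_search(
        query="where are OAuth tokens refreshed",
        project_path=".",          # placeholder project root
        n_results=5,
        language="py",
        node_type="function",
        provider="ollama",         # any supported provider works here
    )
)
print(report)  # formatted results, one "file:lines (relevance: ...)" block per hit
```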
2240
+ async def hybrid_search(
2241
+ query: str,
2242
+ pattern: str | None = None,
2243
+ project_path: str = ".",
2244
+ n_results: int = 10,
2245
+ language: str | None = None,
2246
+ node_type: str | None = None,
2247
+ decorator: str | None = None,
2248
+ is_async: bool | None = None,
2249
+ base_class: str | None = None,
2250
+ provider: EmbeddingProvider = "ollama",
2251
+ ) -> str:
2252
+ """
2253
+ Hybrid search combining semantic similarity with structural AST matching.
2254
+
2255
+ Performs semantic search first, then optionally filters/boosts results
2256
+ that also match an ast-grep structural pattern.
2257
+
2258
+ Args:
2259
+ query: Natural language search query (e.g., "find authentication logic")
2260
+ pattern: Optional ast-grep pattern for structural matching (e.g., "def $FUNC($$$):")
2261
+ project_path: Path to the project root
2262
+ n_results: Maximum number of results to return
2263
+ language: Filter by language (e.g., "py", "ts", "js")
2264
+ node_type: Filter by node type (e.g., "function", "class", "method")
2265
+ decorator: Filter by decorator (e.g., "@property", "@staticmethod")
2266
+ is_async: Filter by async status (True = async only, False = sync only)
2267
+ base_class: Filter by base class (e.g., "BaseClass")
2268
+ provider: Embedding provider (ollama, mxbai, gemini, openai, huggingface)
2269
+
2270
+ Returns:
2271
+ Formatted search results with relevance scores and structural match indicators.
2272
+ """
2273
+ from mcp_bridge.tools.code_search import ast_grep_search
2274
+
2275
+ # Get semantic results (fetch more if we're going to filter)
2276
+ fetch_count = n_results * 2 if pattern else n_results
2277
+ semantic_result = await semantic_search(
2278
+ query=query,
2279
+ project_path=project_path,
2280
+ n_results=fetch_count,
2281
+ language=language,
2282
+ node_type=node_type,
2283
+ decorator=decorator,
2284
+ is_async=is_async,
2285
+ base_class=base_class,
2286
+ provider=provider,
2287
+ )
2288
+
2289
+ if not pattern:
2290
+ return semantic_result
2291
+
2292
+ if semantic_result.startswith("Error:") or semantic_result == "No results found":
2293
+ return semantic_result
2294
+
2295
+ # Get structural matches from ast-grep
2296
+ ast_result = await ast_grep_search(
2297
+ pattern=pattern,
2298
+ directory=project_path,
2299
+ language=language or "",
2300
+ )
2301
+
2302
+ # Extract file paths from ast-grep results
2303
+ ast_files: set[str] = set()
2304
+ if ast_result and not ast_result.startswith("Error:") and ast_result != "No matches found":
2305
+ for line in ast_result.split("\n"):
2306
+ if line.startswith("- "):
2307
+ # Format: "- file.py:123"
2308
+ file_part = line[2:].split(":")[0]
2309
+ ast_files.add(file_part)
2310
+
2311
+ if not ast_files:
2312
+ # No structural matches, return semantic results with note
2313
+ return f"{semantic_result}\n\n[Note: No structural matches for pattern '{pattern}']"
2314
+
2315
+ # Parse semantic results and boost/annotate files that appear in both
2316
+ lines = []
2317
+ result_lines = semantic_result.split("\n")
2318
+ header = result_lines[0] if result_lines else ""
2319
+ lines.append(header.replace("results for:", "hybrid results for:"))
2320
+ lines.append(f"[Structural pattern: {pattern}]\n")
2321
+
2322
+ i = 1
2323
+ boosted_count = 0
2324
+ while i < len(result_lines):
2325
+ line = result_lines[i]
2326
+ if line and (line[0].isdigit() or line.startswith("```") or line.strip()):
2327
+ # Check if this is a result header line (e.g., "1. file.py:10-20")
2328
+ if line and line[0].isdigit() and "." in line:
2329
+ file_part = line.split()[1].split(":")[0] if len(line.split()) > 1 else ""
2330
+ if file_part in ast_files:
2331
+ lines.append(f"{line} 🎯 [structural match]")
2332
+ boosted_count += 1
2333
+ else:
2334
+ lines.append(line)
2335
+ else:
2336
+ lines.append(line)
2337
+ else:
2338
+ lines.append(line)
2339
+ i += 1
2340
+
2341
+ lines.append(
2342
+ f"\n[{boosted_count}/{len(ast_files)} semantic results also match structural pattern]"
2343
+ )
2344
+
2345
+ return "\n".join(lines)
2346
+
2347
+
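And the same for `hybrid_search()`, using the `def $FUNC($$$):` pattern quoted in its docstring above; all other values are placeholders.

```python
import asyncio

report = asyncio.run(
    hybrid_search(
        query="retry logic around HTTP requests",
        pattern="def $FUNC($$$):",   # matches any Python function definition
        project_path=".",
        n_results=5,
        language="py",
    )
)
# Results that also hit the structural pattern are tagged "🎯 [structural match]".
```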
2348
+ async def index_codebase(
2349
+ project_path: str = ".",
2350
+ force: bool = False,
2351
+ provider: EmbeddingProvider = "ollama",
2352
+ ) -> str:
2353
+ """
2354
+ Index a codebase for semantic search.
2355
+
2356
+ Args:
2357
+ project_path: Path to the project root
2358
+ force: If True, reindex everything. Otherwise, only new/changed files.
2359
+ provider: Embedding provider - ollama (local/free), mxbai (local/free),
2360
+ gemini (cloud/OAuth), openai (cloud/OAuth), huggingface (cloud/token)
2361
+
2362
+ Returns:
2363
+ Indexing statistics.
2364
+ """
2365
+ store = get_store(project_path, provider)
2366
+ stats = await store.index_codebase(force=force)
2367
+
2368
+ if "error" in stats:
2369
+ return f"Error: {stats['error']}"
2370
+
2371
+ if stats.get("cancelled"):
2372
+ return (
2373
+ f"⚠️ Indexing cancelled\n"
2374
+ f"Indexed {stats['indexed']} chunks from {stats['total_files']} files before cancellation\n"
2375
+ f"{stats.get('message', '')}"
2376
+ )
2377
+
2378
+ return (
2379
+ f"Indexed {stats['indexed']} chunks from {stats['total_files']} files\n"
2380
+ f"Database: {stats.get('db_path', 'unknown')}\n"
2381
+ f"{stats.get('message', '')}"
2382
+ )
2383
+
2384
+
2385
+ def cancel_indexing(
2386
+ project_path: str = ".",
2387
+ provider: EmbeddingProvider = "ollama",
2388
+ ) -> str:
2389
+ """
2390
+ Cancel an ongoing indexing operation.
2391
+
2392
+ The cancellation happens gracefully between batches - the current batch
2393
+ will complete before the operation stops.
2394
+
2395
+ Args:
2396
+ project_path: Path to the project root
2397
+ provider: Embedding provider (must match the one used for indexing)
2398
+
2399
+ Returns:
2400
+ Confirmation message.
2401
+ """
2402
+ try:
2403
+ store = get_store(project_path, provider)
2404
+ store.request_cancel_indexing()
2405
+ return f"✅ Cancellation requested for {project_path}\nIndexing will stop after current batch completes."
2406
+ except Exception as e:
2407
+ return f"❌ Error requesting cancellation: {e}"
2408
+
2409
+
2410
+ async def semantic_stats(
2411
+ project_path: str = ".",
2412
+ provider: EmbeddingProvider = "ollama",
2413
+ ) -> str:
2414
+ """
2415
+ Get statistics about the semantic search index.
2416
+
2417
+ Args:
2418
+ project_path: Path to the project root
2419
+ provider: Embedding provider - ollama (local/free), mxbai (local/free),
2420
+ gemini (cloud/OAuth), openai (cloud/OAuth), huggingface (cloud/token)
2421
+
2422
+ Returns:
2423
+ Index statistics.
2424
+ """
2425
+ store = get_store(project_path, provider)
2426
+ stats = store.get_stats()
2427
+
2428
+ if "error" in stats:
2429
+ return f"Error: {stats['error']}"
2430
+
2431
+ return (
2432
+ f"Project: {stats['project_path']}\n"
2433
+ f"Database: {stats['db_path']}\n"
2434
+ f"Chunks indexed: {stats['chunks_indexed']}\n"
2435
+ f"Embedding provider: {stats['embedding_provider']} ({stats['embedding_dimension']} dims)"
2436
+ )
2437
+
2438
+
2439
+ def delete_index(
2440
+ project_path: str = ".",
2441
+ provider: EmbeddingProvider | None = None,
2442
+ delete_all: bool = False,
2443
+ ) -> str:
2444
+ """
2445
+ Delete semantic search index for a project.
2446
+
2447
+ Args:
2448
+ project_path: Path to the project root
2449
+ provider: Embedding provider (if None and delete_all=False, deletes all providers for this project)
2450
+ delete_all: If True, delete ALL indexes for ALL projects (ignores project_path and provider)
2451
+
2452
+ Returns:
2453
+ Confirmation message with deleted paths.
2454
+ """
2455
+ import shutil
2456
+
2457
+ vectordb_base = Path.home() / ".stravinsky" / "vectordb"
2458
+
2459
+ if not vectordb_base.exists():
2460
+ return "✅ No semantic search indexes found (vectordb directory doesn't exist)"
2461
+
2462
+ if delete_all:
2463
+ # Delete entire vectordb directory
2464
+ try:
2465
+ shutil.rmtree(vectordb_base)
2466
+ return "✅ Deleted all semantic search indexes for all projects"
2467
+ except Exception as e:
2468
+ return f"❌ Error deleting all indexes: {e}"
2469
+
2470
+ # Generate repo name
2471
+ project_path_resolved = Path(project_path).resolve()
2472
+ repo_name = project_path_resolved.name
2473
+
2474
+ deleted = []
2475
+ errors = []
2476
+
2477
+ if provider:
2478
+ # Delete specific provider index for this project
2479
+ index_path = vectordb_base / f"{repo_name}_{provider}"
2480
+ if index_path.exists():
2481
+ try:
2482
+ shutil.rmtree(index_path)
2483
+ deleted.append(str(index_path))
2484
+ except Exception as e:
2485
+ errors.append(f"{provider}: {e}")
2486
+ else:
2487
+ errors.append(f"{provider}: Index not found")
2488
+ else:
2489
+ # Delete all provider indexes for this project
2490
+ providers: list[EmbeddingProvider] = ["ollama", "mxbai", "gemini", "openai", "huggingface"]
2491
+ for prov in providers:
2492
+ index_path = vectordb_base / f"{repo_name}_{prov}"
2493
+ if index_path.exists():
2494
+ try:
2495
+ shutil.rmtree(index_path)
2496
+ deleted.append(str(index_path))
2497
+ except Exception as e:
2498
+ errors.append(f"{prov}: {e}")
2499
+
2500
+ if not deleted and not errors:
2501
+ return f"⚠️ No indexes found for project: {project_path_resolved}\nRepo name: {repo_name}"
2502
+
2503
+ result = []
2504
+ if deleted:
2505
+ result.append(f"✅ Deleted {len(deleted)} index(es):")
2506
+ for path in deleted:
2507
+ result.append(f" - {path}")
2508
+ if errors:
2509
+ result.append(f"\n❌ Errors ({len(errors)}):")
2510
+ for error in errors:
2511
+ result.append(f" - {error}")
2512
+
2513
+ return "\n".join(result)
2514
+
2515
+
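For orientation, the directories `delete_index()` targets follow the `~/.stravinsky/vectordb/<repo>_<provider>` naming used above. A short sketch of that layout; the repo name is simply whatever the current directory resolves to.

```python
from pathlib import Path

vectordb_base = Path.home() / ".stravinsky" / "vectordb"
repo_name = Path(".").resolve().name  # e.g. "my-project"

# One directory per (repo, provider) pair, e.g.:
#   ~/.stravinsky/vectordb/my-project_ollama
#   ~/.stravinsky/vectordb/my-project_gemini
print(vectordb_base / f"{repo_name}_ollama")
```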
2516
+ async def semantic_health(project_path: str = ".", provider: EmbeddingProvider = "ollama") -> str:
2517
+ """Check health of semantic search system."""
2518
+ store = get_store(project_path, provider)
2519
+
2520
+ status = []
2521
+
2522
+ # Check Provider
2523
+ try:
2524
+ is_avail = await store.check_embedding_service()
2525
+ status.append(
2526
+ f"Provider ({store.provider.name}): {'✅ Online' if is_avail else '❌ Offline'}"
2527
+ )
2528
+ except Exception as e:
2529
+ status.append(f"Provider ({store.provider.name}): ❌ Error - {e}")
2530
+
2531
+ # Check DB
2532
+ try:
2533
+ count = store.collection.count()
2534
+ status.append(f"Vector DB: ✅ Online ({count} documents)")
2535
+ except Exception as e:
2536
+ status.append(f"Vector DB: ❌ Error - {e}")
2537
+
2538
+ return "\n".join(status)
2539
+
2540
+
2541
+ # ========================
2542
+ # FILE WATCHER MANAGEMENT
2543
+ # ========================
2544
+
2545
+
2546
+ async def start_file_watcher(
2547
+ project_path: str,
2548
+ provider: EmbeddingProvider = "ollama",
2549
+ debounce_seconds: float = 2.0,
2550
+ ) -> "CodebaseFileWatcher":
2551
+ """Start watching a project directory for file changes.
2552
+
2553
+ If an index exists, automatically performs an incremental reindex to catch up
2554
+ on any changes that happened while the watcher was not running.
2555
+
2556
+ Args:
2557
+ project_path: Path to the project root
2558
+ provider: Embedding provider to use for reindexing
2559
+ debounce_seconds: Time to wait before reindexing after changes
2560
+
2561
+ Returns:
2562
+ The started CodebaseFileWatcher instance
2563
+ """
2564
+ normalized_path = CodebaseVectorStore._normalize_project_path(project_path)
2565
+ path_key = str(normalized_path)
2566
+
2567
+ with _watchers_lock:
2568
+ if path_key not in _watchers:
2569
+ store = get_store(project_path, provider)
2570
+
2571
+ # Check if index exists - create if missing, update if stale
2572
+ try:
2573
+ stats = store.get_stats()
2574
+ chunks_indexed = stats.get("chunks_indexed", 0)
2575
+
2576
+ if chunks_indexed == 0:
2577
+ # No index exists - create initial index
2578
+ print("📋 No index found, creating initial index...", file=sys.stderr)
2579
+ await store.index_codebase(force=False)
2580
+ print("✅ Initial index created, starting file watcher", file=sys.stderr)
2581
+ else:
2582
+ # Index exists - catch up on any missed changes since watcher was off
2583
+ print("📋 Catching up on changes since last index...", file=sys.stderr)
2584
+ await store.index_codebase(force=False)
2585
+ print("✅ Index updated, starting file watcher", file=sys.stderr)
2586
+
2587
+ except Exception as e:
2588
+ # Failed to index - log and create watcher anyway (it will index on file changes)
2589
+ logger.warning(f"Failed to index before starting watcher: {e}")
2590
+ print(f"⚠️ Warning: Could not index project: {e}", file=sys.stderr)
2591
+ print(
2592
+ "🔄 Starting watcher anyway - will index on first file change", file=sys.stderr
2593
+ )
2594
+
2595
+ watcher = store.start_watching(debounce_seconds=debounce_seconds)
2596
+ _watchers[path_key] = watcher
2597
+ else:
2598
+ watcher = _watchers[path_key]
2599
+ if not watcher.is_running():
2600
+ watcher.start()
2601
+ return _watchers[path_key]
2602
+
2603
+
2604
+ def stop_file_watcher(project_path: str) -> bool:
2605
+ """Stop watching a project directory.
2606
+
2607
+ Args:
2608
+ project_path: Path to the project root
2609
+
2610
+ Returns:
2611
+ True if watcher was stopped, False if no watcher was active
2612
+ """
2613
+ normalized_path = CodebaseVectorStore._normalize_project_path(project_path)
2614
+ path_key = str(normalized_path)
2615
+
2616
+ with _watchers_lock:
2617
+ if path_key in _watchers:
2618
+ watcher = _watchers[path_key]
2619
+ watcher.stop()
2620
+ del _watchers[path_key]
2621
+ return True
2622
+ return False
2623
+
2624
+
2625
+ def get_file_watcher(project_path: str) -> "CodebaseFileWatcher | None":
2626
+ """Get an active file watcher for a project.
2627
+
2628
+ Args:
2629
+ project_path: Path to the project root
2630
+
2631
+ Returns:
2632
+ The CodebaseFileWatcher if active, None otherwise
2633
+ """
2634
+ normalized_path = CodebaseVectorStore._normalize_project_path(project_path)
2635
+ path_key = str(normalized_path)
2636
+
2637
+ with _watchers_lock:
2638
+ watcher = _watchers.get(path_key)
2639
+ if watcher is not None and watcher.is_running():
2640
+ return watcher
2641
+ return None
2642
+
2643
+
2644
+ def list_file_watchers() -> list[dict]:
2645
+ """List all active file watchers.
2646
+
2647
+ Returns:
2648
+ List of dicts with watcher info (project_path, debounce_seconds, provider, status)
2649
+ """
2650
+ with _watchers_lock:
2651
+ watchers_info = []
2652
+ for path, watcher in _watchers.items():
2653
+ watchers_info.append(
2654
+ {
2655
+ "project_path": path,
2656
+ "debounce_seconds": watcher.debounce_seconds,
2657
+ "provider": watcher.store.provider_name,
2658
+ "status": "running" if watcher.is_running() else "stopped",
2659
+ }
2660
+ )
2661
+ return watchers_info
2662
+
2663
+
2664
+ # ========================
2665
+ # MULTI-QUERY EXPANSION & DECOMPOSITION
2666
+ # ========================
2667
+
2668
+
2669
+ async def _expand_query_with_llm(query: str, num_variations: int = 3) -> list[str]:
2670
+ """
2671
+ Use LLM to rephrase a query into multiple semantic variations.
2672
+
2673
+ For example: "database connection" -> ["SQLAlchemy engine setup",
2674
+ "connect to postgres", "db session management"]
2675
+
2676
+ Args:
2677
+ query: Original search query
2678
+ num_variations: Number of variations to generate (default: 3)
2679
+
2680
+ Returns:
2681
+ List of query variations including the original
2682
+ """
2683
+ from mcp_bridge.tools.model_invoke import invoke_gemini
2684
+
2685
+ prompt = f"""You are a code search query expander. Given a search query, generate {num_variations} alternative phrasings that would help find relevant code.
2686
+
2687
+ Original query: "{query}"
2688
+
2689
+ Generate {num_variations} alternative queries that:
2690
+ 1. Use different technical terminology (e.g., "database" -> "SQLAlchemy", "ORM", "connection pool")
2691
+ 2. Reference specific implementations or patterns
2692
+ 3. Include related concepts that might appear in code
2693
+
2694
+ Return ONLY the alternative queries, one per line. No numbering, no explanations.
2695
+ Example output for "database connection":
2696
+ SQLAlchemy engine configuration
2697
+ postgres connection setup
2698
+ db session factory pattern"""
2699
+
2700
+ try:
2701
+ result = await invoke_gemini(
2702
+ token_store=TokenStore(),
2703
+ prompt=prompt,
2704
+ model="gemini-3-flash",
2705
+ temperature=0.7,
2706
+ max_tokens=200,
2707
+ )
2708
+
2709
+ # Parse variations from response
2710
+ variations = [line.strip() for line in result.strip().split("\n") if line.strip()]
2711
+ # Always include original query first
2712
+ all_queries = [query] + variations[:num_variations]
2713
+ return all_queries
2714
+
2715
+ except Exception as e:
2716
+ logger.warning(f"Query expansion failed: {e}, using original query only")
2717
+ return [query]
2718
+
2719
+
2720
+ async def _decompose_query_with_llm(query: str) -> list[str]:
2721
+ """
2722
+ Break a complex query into smaller, focused sub-questions.
2723
+
2724
+ For example: "Initialize the DB and then create a user model" ->
2725
+ ["database initialization", "user model definition"]
2726
+
2727
+ Args:
2728
+ query: Complex search query
2729
+
2730
+ Returns:
2731
+ List of sub-queries, or [query] if decomposition not needed
2732
+ """
2733
+ from mcp_bridge.tools.model_invoke import invoke_gemini
2734
+
2735
+ prompt = f"""You are a code search query analyzer. Determine if this query should be broken into sub-queries.
2736
+
2737
+ Query: "{query}"
2738
+
2739
+ If the query contains multiple distinct concepts (connected by "and", "then", "also", etc.),
2740
+ break it into separate focused sub-queries.
2741
+
2742
+ If the query is already focused on a single concept, return just that query.
2743
+
2744
+ Return ONLY the sub-queries, one per line. No numbering, no explanations.
2745
+
2746
+ Examples:
2747
+ - "Initialize the DB and then create a user model" ->
2748
+ database initialization
2749
+ user model definition
2750
+
2751
+ - "authentication logic" ->
2752
+ authentication logic"""
2753
+
2754
+ try:
2755
+ result = await invoke_gemini(
2756
+ token_store=TokenStore(),
2757
+ prompt=prompt,
2758
+ model="gemini-3-flash",
2759
+ temperature=0.3, # Lower temperature for more consistent decomposition
2760
+ max_tokens=150,
2761
+ )
2762
+
2763
+ # Parse sub-queries from response
2764
+ sub_queries = [line.strip() for line in result.strip().split("\n") if line.strip()]
2765
+ return sub_queries if sub_queries else [query]
2766
+
2767
+ except Exception as e:
2768
+ logger.warning(f"Query decomposition failed: {e}, using original query")
2769
+ return [query]
2770
+
2771
+
2772
+ def _aggregate_results(
2773
+ all_results: list[list[dict]],
2774
+ n_results: int = 10,
2775
+ ) -> list[dict]:
2776
+ """
2777
+ Aggregate and deduplicate results from multiple queries.
2778
+
2779
+ Uses reciprocal rank fusion to combine relevance scores from different queries.
2780
+
2781
+ Args:
2782
+ all_results: List of result lists from different queries
2783
+ n_results: Maximum number of results to return
2784
+
2785
+ Returns:
2786
+ Deduplicated and re-ranked results
2787
+ """
2788
+ # Track seen files to avoid duplicates
2789
+ seen_files: dict[str, dict] = {} # file:lines -> result with best score
2790
+ file_scores: dict[str, float] = {} # file:lines -> aggregated score
2791
+
2792
+ # Reciprocal Rank Fusion constant
2793
+ k = 60
2794
+
2795
+ for _query_idx, results in enumerate(all_results):
2796
+ for rank, result in enumerate(results):
2797
+ file_key = f"{result.get('file', '')}:{result.get('lines', '')}"
2798
+
2799
+ # RRF score contribution
2800
+ rrf_score = 1 / (k + rank + 1)
2801
+
2802
+ if file_key not in seen_files:
2803
+ seen_files[file_key] = result.copy()
2804
+ file_scores[file_key] = rrf_score
2805
+ else:
2806
+ # Aggregate scores
2807
+ file_scores[file_key] += rrf_score
2808
+ # Keep higher original relevance if available
2809
+ if result.get("relevance", 0) > seen_files[file_key].get("relevance", 0):
2810
+ seen_files[file_key] = result.copy()
2811
+
2812
+ # Sort by aggregated score and return top N
2813
+ sorted_keys = sorted(file_scores.keys(), key=lambda k: file_scores[k], reverse=True)
2814
+
2815
+ aggregated = []
2816
+ for key in sorted_keys[:n_results]:
2817
+ result = seen_files[key]
2818
+ # Update relevance to reflect aggregated score (normalized)
2819
+ max_score = max(file_scores.values()) if file_scores else 1
2820
+ result["relevance"] = round(file_scores[key] / max_score, 3)
2821
+ aggregated.append(result)
2822
+
2823
+ return aggregated
2824
+
2825
+
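A worked example of the reciprocal rank fusion used by `_aggregate_results()`, with two toy ranked result lists (the ids are invented):

```python
# Each result contributes 1 / (k + rank + 1) per list it appears in.
k = 60
query_a = ["auth.py:10-40", "db.py:5-30", "utils.py:1-20"]   # ranks 0, 1, 2
query_b = ["auth.py:10-40", "config.py:1-15"]                # ranks 0, 1

scores: dict[str, float] = {}
for ranking in (query_a, query_b):
    for rank, key in enumerate(ranking):
        scores[key] = scores.get(key, 0.0) + 1 / (k + rank + 1)

# auth.py:10-40   -> 1/61 + 1/61 ≈ 0.0328  (near the top of both lists)
# db.py:5-30      -> 1/62        ≈ 0.0161
# config.py:1-15  -> 1/62        ≈ 0.0161
# utils.py:1-20   -> 1/63        ≈ 0.0159
best = max(scores, key=scores.get)  # "auth.py:10-40"
```

Items that rank well across several query variations accumulate score, which is what pushes them ahead of one-off hits after deduplication.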
2826
+ async def multi_query_search(
2827
+ query: str,
2828
+ project_path: str = ".",
2829
+ n_results: int = 10,
2830
+ num_expansions: int = 3,
2831
+ language: str | None = None,
2832
+ node_type: str | None = None,
2833
+ provider: EmbeddingProvider = "ollama",
2834
+ ) -> str:
2835
+ """
2836
+ Search with LLM-expanded query variations for better recall.
2837
+
2838
+ Rephrases the query into multiple semantic variations, searches for each,
2839
+ and aggregates results using reciprocal rank fusion.
2840
+
2841
+ Args:
2842
+ query: Natural language search query
2843
+ project_path: Path to the project root
2844
+ n_results: Maximum number of results to return
2845
+ num_expansions: Number of query variations to generate (default: 3)
2846
+ language: Filter by language (e.g., "py", "ts")
2847
+ node_type: Filter by node type (e.g., "function", "class")
2848
+ provider: Embedding provider
2849
+
2850
+ Returns:
2851
+ Formatted search results with relevance scores.
2852
+ """
2853
+ import asyncio
2854
+
2855
+ print(f"🔍 MULTI-QUERY: Expanding '{query[:50]}...'", file=sys.stderr)
2856
+
2857
+ # Get query expansions
2858
+ expanded_queries = await _expand_query_with_llm(query, num_expansions)
2859
+ print(f" Generated {len(expanded_queries)} query variations", file=sys.stderr)
2860
+
2861
+ # Get store once
2862
+ store = get_store(project_path, provider)
2863
+
2864
+ # Search with all queries in parallel
2865
+ async def search_single(q: str) -> list[dict]:
2866
+ return await store.search(
2867
+ q,
2868
+ n_results=n_results, # Get full results for each query
2869
+ language=language,
2870
+ node_type=node_type,
2871
+ )
2872
+
2873
+ all_results = await asyncio.gather(*[search_single(q) for q in expanded_queries])
2874
+
2875
+ # Filter out error results
2876
+ valid_results = [r for r in all_results if r and "error" not in r[0]]
2877
+
2878
+ if not valid_results:
2879
+ if all_results and all_results[0] and "error" in all_results[0][0]:
2880
+ return f"Error: {all_results[0][0]['error']}"
2881
+ return "No results found"
2882
+
2883
+ # Aggregate results
2884
+ aggregated = _aggregate_results(valid_results, n_results)
2885
+
2886
+ if not aggregated:
2887
+ return "No results found"
2888
+
2889
+ # Format output
2890
+ lines = [f"Found {len(aggregated)} results for multi-query expansion of: '{query}'"]
2891
+ lines.append(
2892
+ f"[Expanded to: {', '.join(q[:30] + '...' if len(q) > 30 else q for q in expanded_queries)}]\n"
2893
+ )
2894
+
2895
+ for i, r in enumerate(aggregated, 1):
2896
+ lines.append(f"{i}. {r['file']}:{r['lines']} (relevance: {r['relevance']})")
2897
+ lines.append(f"```{r.get('language', '')}")
2898
+ lines.append(r.get("code_preview", ""))
2899
+ lines.append("```\n")
2900
+
2901
+ return "\n".join(lines)
2902
+
2903
+
2904
+ async def decomposed_search(
2905
+ query: str,
2906
+ project_path: str = ".",
2907
+ n_results: int = 10,
2908
+ language: str | None = None,
2909
+ node_type: str | None = None,
2910
+ provider: EmbeddingProvider = "ollama",
2911
+ ) -> str:
2912
+ """
2913
+ Search by decomposing complex queries into focused sub-questions.
2914
+
2915
+ Breaks multi-part queries like "Initialize the DB and create a user model"
2916
+ into separate searches, returning organized results for each part.
2917
+
2918
+ Args:
2919
+ query: Complex search query (may contain multiple concepts)
2920
+ project_path: Path to the project root
2921
+ n_results: Maximum results per sub-query
2922
+ language: Filter by language
2923
+ node_type: Filter by node type
2924
+ provider: Embedding provider
2925
+
2926
+ Returns:
2927
+ Formatted results organized by sub-question.
2928
+ """
2929
+ import asyncio
2930
+
2931
+ print(f"🔍 DECOMPOSED-SEARCH: Analyzing '{query[:50]}...'", file=sys.stderr)
2932
+
2933
+ # Decompose query
2934
+ sub_queries = await _decompose_query_with_llm(query)
2935
+ print(f" Decomposed into {len(sub_queries)} sub-queries", file=sys.stderr)
2936
+
2937
+ if len(sub_queries) == 1 and sub_queries[0] == query:
2938
+ # No decomposition needed, use regular search
2939
+ return await semantic_search(
2940
+ query=query,
2941
+ project_path=project_path,
2942
+ n_results=n_results,
2943
+ language=language,
2944
+ node_type=node_type,
2945
+ provider=provider,
2946
+ )
2947
+
2948
+ # Get store once
2949
+ store = get_store(project_path, provider)
2950
+
2951
+ # Search each sub-query in parallel
2952
+ async def search_sub(q: str) -> tuple[str, list[dict]]:
2953
+ results = await store.search(
2954
+ q,
2955
+ n_results=n_results // len(sub_queries) + 2, # Distribute results
2956
+ language=language,
2957
+ node_type=node_type,
2958
+ )
2959
+ return (q, results)
2960
+
2961
+ sub_results = await asyncio.gather(*[search_sub(q) for q in sub_queries])
2962
+
2963
+ # Format output with sections for each sub-query
2964
+ lines = [f"Decomposed search for: '{query}'"]
2965
+ lines.append(f"[Split into {len(sub_queries)} sub-queries]\n")
2966
+
2967
+ total_results = 0
2968
+ for sub_query, results in sub_results:
2969
+ lines.append(f"### {sub_query}")
2970
+
2971
+ if not results or (results and "error" in results[0]):
2972
+ lines.append(" No results found\n")
2973
+ continue
2974
+
2975
+ for i, r in enumerate(results[:5], 1): # Limit per sub-query
2976
+ lines.append(f" {i}. {r['file']}:{r['lines']} (relevance: {r['relevance']})")
2977
+ # Shorter preview for decomposed results
2978
+ preview = r.get("code_preview", "")[:200]
2979
+ if len(r.get("code_preview", "")) > 200:
2980
+ preview += "..."
2981
+ lines.append(f" ```{r.get('language', '')}")
2982
+ lines.append(f" {preview}")
2983
+ lines.append(" ```")
2984
+ total_results += 1
2985
+ lines.append("")
2986
+
2987
+ lines.append(f"[Total: {total_results} results across {len(sub_queries)} sub-queries]")
2988
+
2989
+ return "\n".join(lines)
2990
+
2991
+
2992
+ async def enhanced_search(
2993
+ query: str,
2994
+ project_path: str = ".",
2995
+ n_results: int = 10,
2996
+ mode: str = "auto",
2997
+ language: str | None = None,
2998
+ node_type: str | None = None,
2999
+ provider: EmbeddingProvider = "ollama",
3000
+ ) -> str:
3001
+ """
3002
+ Unified enhanced search combining expansion and decomposition.
3003
+
3004
+ Automatically selects the best strategy based on query complexity:
3005
+ - Simple queries: Multi-query expansion for better recall
3006
+ - Complex queries: Decomposition + expansion for comprehensive coverage
3007
+
3008
+ Args:
3009
+ query: Search query (simple or complex)
3010
+ project_path: Path to the project root
3011
+ n_results: Maximum number of results
3012
+ mode: Search mode - "auto", "expand", "decompose", or "both"
3013
+ language: Filter by language
3014
+ node_type: Filter by node type
3015
+ provider: Embedding provider
3016
+
3017
+ Returns:
3018
+ Formatted search results.
3019
+ """
3020
+ # Use classifier for intelligent mode selection
3021
+ classification = classify_query(query)
3022
+ logger.debug(
3023
+ f"Query classified as {classification.category.value} "
3024
+ f"(confidence: {classification.confidence:.2f}, suggested: {classification.suggested_tool})"
3025
+ )
3026
+
3027
+ # Determine mode based on classification
3028
+ if mode == "auto":
3029
+ # HYBRID → decompose (complex multi-part queries)
3030
+ # SEMANTIC → expand (conceptual queries benefit from variations)
3031
+ # PATTERN/STRUCTURAL → expand (simple queries, quick path)
3032
+ mode = "decompose" if classification.category == QueryCategory.HYBRID else "expand"
3033
+
3034
+ if mode == "decompose":
3035
+ return await decomposed_search(
3036
+ query=query,
3037
+ project_path=project_path,
3038
+ n_results=n_results,
3039
+ language=language,
3040
+ node_type=node_type,
3041
+ provider=provider,
3042
+ )
3043
+ elif mode == "expand":
3044
+ return await multi_query_search(
3045
+ query=query,
3046
+ project_path=project_path,
3047
+ n_results=n_results,
3048
+ language=language,
3049
+ node_type=node_type,
3050
+ provider=provider,
3051
+ )
3052
+ elif mode == "both":
3053
+ # Decompose first, then expand each sub-query
3054
+ sub_queries = await _decompose_query_with_llm(query)
3055
+
3056
+ all_results: list[list[dict]] = []
3057
+ store = get_store(project_path, provider)
3058
+
3059
+ for sub_q in sub_queries:
3060
+ # Expand each sub-query
3061
+ expanded = await _expand_query_with_llm(sub_q, num_variations=2)
3062
+ for exp_q in expanded:
3063
+ results = await store.search(
3064
+ exp_q,
3065
+ n_results=5,
3066
+ language=language,
3067
+ node_type=node_type,
3068
+ )
3069
+ if results and "error" not in results[0]:
3070
+ all_results.append(results)
3071
+
3072
+ aggregated = _aggregate_results(all_results, n_results)
3073
+
3074
+ if not aggregated:
3075
+ return "No results found"
3076
+
3077
+ lines = [f"Enhanced search (decompose+expand) for: '{query}'"]
3078
+ lines.append(f"[{len(sub_queries)} sub-queries × expansions]\n")
3079
+
3080
+ for i, r in enumerate(aggregated, 1):
3081
+ lines.append(f"{i}. {r['file']}:{r['lines']} (relevance: {r['relevance']})")
3082
+ lines.append(f"```{r.get('language', '')}")
3083
+ lines.append(r.get("code_preview", ""))
3084
+ lines.append("```\n")
3085
+
3086
+ return "\n".join(lines)
3087
+
3088
+ else:
3089
+ return f"Unknown mode: {mode}. Use 'auto', 'expand', 'decompose', or 'both'"
3090
+
3091
+
3092
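Since `enhanced_search` is the public entry point of this section, a minimal usage sketch may help; the query string and project path below are hypothetical, and a configured embedding backend (Ollama, the default provider) is assumed:

```python
# Hypothetical usage of enhanced_search(); assumes this module is importable
# and an Ollama embedding backend is available for the default provider.
import asyncio


async def main() -> None:
    # mode="auto" lets classify_query() decide between expansion and decomposition.
    report = await enhanced_search(
        "how are reindex requests debounced",   # hypothetical query
        project_path=".",
        n_results=10,
        mode="auto",                            # or "expand", "decompose", "both"
        language="python",
        provider="ollama",
    )
    print(report)


if __name__ == "__main__":
    asyncio.run(main())
```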
+ # ========================
3093
+ # FILE WATCHER IMPLEMENTATION
3094
+ # ========================
3095
+
3096
+
3097
+ class DedicatedIndexingWorker:
3098
+ """Single-threaded worker for all indexing operations.
3099
+
3100
+ Prevents concurrent indexing by serializing all operations through a queue.
3101
+ Uses asyncio.run() for each operation to avoid event loop reuse issues.
3102
+ """
3103
+
3104
+ def __init__(self, store: "CodebaseVectorStore"):
3105
+ """Initialize the indexing worker.
3106
+
3107
+ Args:
3108
+ store: CodebaseVectorStore instance for reindexing
3109
+ """
3110
+ import queue
3111
+
3112
+ self.store = store
3113
+ self._queue: queue.Queue = queue.Queue(maxsize=1) # Max 1 pending request (debouncing)
3114
+ self._thread: threading.Thread | None = None
3115
+ self._shutdown = threading.Event()
3116
+ self._log_file = Path.home() / ".stravinsky" / "logs" / "file_watcher.log"
3117
+ self._log_file.parent.mkdir(parents=True, exist_ok=True)
3118
+
3119
+ def start(self) -> None:
3120
+ """Start the worker thread."""
3121
+ if self._thread is not None and self._thread.is_alive():
3122
+ logger.warning("Indexing worker already running")
3123
+ return
3124
+
3125
+ self._shutdown.clear()
3126
+ self._thread = threading.Thread(
3127
+ target=self._run_worker, daemon=False, name="IndexingWorker"
3128
+ )
3129
+ self._thread.start()
3130
+ logger.info(f"Started indexing worker for {self.store.project_path}")
3131
+
3132
+ def _log_error(self, msg: str, exc: Exception | None = None):
3133
+ """Write error to log file with timestamp and full traceback."""
3134
+ import traceback
3135
+ from datetime import datetime
3136
+
3137
+ timestamp = datetime.now().isoformat()
3138
+ try:
3139
+ with open(self._log_file, "a") as f:
3140
+ f.write(f"\n{'=' * 80}\n")
3141
+ f.write(f"[{timestamp}] {msg}\n")
3142
+ if exc:
3143
+ f.write(f"Exception: {type(exc).__name__}: {exc}\n")
3144
+ f.write(traceback.format_exc())
3145
+ f.write(f"{'=' * 80}\n")
3146
+ except Exception as log_exc:
3147
+ logger.error(f"Failed to write to log file: {log_exc}")
3148
+ logger.error(f"{msg} (logged to {self._log_file})")
3149
+
3150
+ def _run_worker(self) -> None:
3151
+ """Worker thread entry point - processes queue with asyncio.run() per operation."""
3152
+ import queue
3153
+
3154
+ self._log_error(f"🟢 File watcher started for {self.store.project_path}")
3155
+
3156
+ try:
3157
+ while not self._shutdown.is_set():
3158
+ try:
3159
+ # Wait for reindex request (blocking with timeout)
3160
+ self._queue.get(timeout=0.5)
3161
+ self._queue.task_done()
3162
+
3163
+ # Use asyncio.run() for each operation (creates fresh loop)
3164
+ # This avoids "event loop already running" errors
3165
+ try:
3166
+ asyncio.run(self._do_reindex())
3167
+ self._log_error(f"✅ Reindex completed for {self.store.project_path}")
3168
+ except Exception as e:
3169
+ self._log_error(f"⚠️ Reindex failed for {self.store.project_path}", e)
3170
+
3171
+ except queue.Empty:
3172
+ continue # No work, check shutdown flag
3173
+ except Exception as e:
3174
+ self._log_error(f"⚠️ Queue processing error for {self.store.project_path}", e)
3175
+
3176
+ except Exception as e:
3177
+ self._log_error(f"⚠️ Worker thread crashed for {self.store.project_path}", e)
3178
+ finally:
3179
+ self._log_error(f"🔴 File watcher stopped for {self.store.project_path}")
3180
+
3181
+ async def _do_reindex(self) -> None:
3182
+ """Execute reindex with retry logic for ALL error types."""
3183
+ import sqlite3
3184
+
3185
+ from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
3186
+
3187
+ @retry(
3188
+ stop=stop_after_attempt(3),
3189
+ wait=wait_exponential(multiplier=1, min=2, max=10),
3190
+ retry=retry_if_exception_type(
3191
+ (
3192
+ httpx.HTTPError,
3193
+ ConnectionError,
3194
+ TimeoutError,
3195
+ sqlite3.OperationalError, # Database locked
3196
+ OSError, # File system errors
3197
+ )
3198
+ ),
3199
+ reraise=True,
3200
+ )
3201
+ async def _indexed():
3202
+ await self.store.index_codebase(force=False)
3203
+
3204
+ await _indexed()
3205
+
3206
+ def request_reindex(self, files: list[Path]) -> None:
3207
+ """Request reindex from any thread (thread-safe).
3208
+
3209
+ Args:
3210
+ files: List of files that changed (for logging only)
3211
+ """
3212
+ import queue
3213
+
3214
+ try:
3215
+ # Non-blocking put - drops if queue full (natural debouncing)
3216
+ self._queue.put_nowait("reindex")
3217
+ logger.debug(f"📥 Queued reindex for {len(files)} files: {[f.name for f in files[:5]]}")
3218
+ except queue.Full:
3219
+ # Already have pending reindex - this is fine (debouncing)
3220
+ logger.debug(f"Reindex already queued, skipping {len(files)} files")
3221
+
3222
+ def shutdown(self) -> None:
3223
+ """Graceful shutdown of worker thread."""
3224
+ if self._shutdown.is_set():
3225
+ return # Already shutting down
3226
+
3227
+ self._shutdown.set()
3228
+ if self._thread is not None and self._thread.is_alive():
3229
+ self._thread.join(timeout=10) # Wait up to 10 seconds
3230
+ if self._thread.is_alive():
3231
+ self._log_error("⚠️ Worker thread failed to stop within timeout")
3232
+ self._thread = None
3233
+ logger.info("Indexing worker shut down")
3234
+
3235
+
3236
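A sketch of the worker lifecycle, assuming `get_store()` returns a `CodebaseVectorStore` as it does elsewhere in this module; the file path passed to `request_reindex` is hypothetical:

```python
# Lifecycle sketch for DedicatedIndexingWorker; get_store() is assumed to return
# a CodebaseVectorStore, as it does earlier in this module.
from pathlib import Path

store = get_store(".", "ollama")
worker = DedicatedIndexingWorker(store)
worker.start()

# Requests are serialized through a maxsize=1 queue: a second request arriving
# while one is still pending is silently dropped, which provides the debouncing.
worker.request_reindex([Path("src/example_module.py")])  # hypothetical path

worker.shutdown()  # joins the worker thread (up to 10 seconds)
```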
+ class CodebaseFileWatcher:
3237
+ """Watch a project directory for file changes and trigger reindexing.
3238
+
3239
+ Features:
3240
+ - Watches for file create, modify, delete, move events
3241
+ - Filters to .py files only
3242
+ - Skips hidden files and directories (e.g. .git, .venv) and common build/dependency dirs (__pycache__, venv, node_modules)
3243
+ - Debounces rapid changes to batch them into a single reindex
3244
+ - Thread-safe; the watchdog observer runs as a daemon thread and the indexing worker is joined on shutdown
3245
+ - Integrates with CodebaseVectorStore for incremental indexing
3246
+ - Uses dedicated worker thread to prevent concurrent indexing
3247
+ """
3248
+
3249
+ # Default debounce time in seconds
3250
+ DEFAULT_DEBOUNCE_SECONDS = 2.0
3251
+
3252
+ def __init__(
3253
+ self,
3254
+ project_path: Path | str,
3255
+ store: CodebaseVectorStore,
3256
+ debounce_seconds: float = DEFAULT_DEBOUNCE_SECONDS,
3257
+ ):
3258
+ """Initialize the file watcher.
3259
+
3260
+ Args:
3261
+ project_path: Path to the project root to watch
3262
+ store: CodebaseVectorStore instance for reindexing
3263
+ debounce_seconds: Time to wait before reindexing after changes (default: 2.0s)
3264
+ """
3265
+ self.project_path = Path(project_path).resolve()
3266
+ self.store = store
3267
+ self.debounce_seconds = debounce_seconds
3268
+
3269
+ # Observer and handler for watchdog
3270
+ self._observer = None
3271
+ self._event_handler = None
3272
+
3273
+ # Native watcher
3274
+ self._native_watcher: NativeFileWatcher | None = None
3275
+
3276
+ # Thread safety
3277
+ self._lock = threading.Lock()
3278
+ self._running = False
3279
+
3280
+ # Debouncing
3281
+ self._pending_reindex_timer: threading.Timer | None = None
3282
+ self._pending_files: set[Path] = set()
3283
+ self._pending_lock = threading.Lock()
3284
+
3285
+ # Dedicated indexing worker (prevents concurrent access)
3286
+ self._indexing_worker = DedicatedIndexingWorker(store)
3287
+
3288
+ def start(self) -> None:
3289
+ """Start watching the project directory.
3290
+
3291
+ Creates and starts a watchdog observer in a daemon thread.
3292
+ Also starts the dedicated indexing worker thread.
3293
+ """
3294
+ with self._lock:
3295
+ if self._running:
3296
+ logger.warning(f"Watcher for {self.project_path} is already running")
3297
+ return
3298
+
3299
+ try:
3300
+ # Start indexing worker first (must be running before file events arrive)
3301
+ self._indexing_worker.start()
3302
+
3303
+ # Try native watcher first
3304
+ try:
3305
+ self._native_watcher = NativeFileWatcher(
3306
+ str(self.project_path),
3307
+ on_change=lambda type, path: self._on_file_changed(Path(path))
3308
+ )
3309
+ self._native_watcher.start()
3310
+ self._running = True
3311
+ logger.info(f"Native file watcher started for {self.project_path}")
3312
+ return
3313
+ except Exception as e:
3314
+ logger.info(f"Native watcher not available, falling back to watchdog: {e}")
3315
+ self._native_watcher = None
3316
+
3317
+ watchdog = get_watchdog()
3318
+ Observer = watchdog["Observer"]
3319
+
3320
+ # Create event handler class and instantiate
3321
+ FileChangeHandler = _create_file_change_handler_class()
3322
+ self._event_handler = FileChangeHandler(
3323
+ project_path=self.project_path,
3324
+ watcher=self,
3325
+ )
3326
+
3327
+ # Create and start observer (daemon mode for clean shutdown)
3328
+ self._observer = Observer()
3329
+ self._observer.daemon = True
3330
+ self._observer.schedule(
3331
+ self._event_handler,
3332
+ str(self.project_path),
3333
+ recursive=True,
3334
+ )
3335
+ self._observer.start()
3336
+ self._running = True
3337
+ logger.info(f"File watcher started for {self.project_path}")
3338
+
3339
+ except Exception as e:
3340
+ logger.error(f"Failed to start file watcher: {e}")
3341
+ self._running = False
3342
+ # Clean up worker if observer failed
3343
+ self._indexing_worker.shutdown()
3344
+ raise
3345
+
3346
+ def stop(self) -> None:
3347
+ """Stop watching the project directory.
3348
+
3349
+ Cancels any pending reindex timers, stops the observer, and shuts down the indexing worker.
3350
+ """
3351
+ with self._lock:
3352
+ # Cancel pending reindex timer
3353
+ with self._pending_lock:
3354
+ if self._pending_reindex_timer:
3355
+ self._pending_reindex_timer.cancel()
3356
+ self._pending_reindex_timer = None
3357
+ self._pending_files.clear()
3358
+
3359
+ # Stop native watcher
3360
+ if self._native_watcher:
3361
+ self._native_watcher.stop()
3362
+ self._native_watcher = None
3363
+
3364
+ # Stop observer
3365
+ if self._observer:
3366
+ self._observer.stop()
3367
+ self._observer.join(timeout=5) # Wait up to 5 seconds for shutdown
3368
+ self._observer = None
3369
+
3370
+ # Shutdown indexing worker
3371
+ self._indexing_worker.shutdown()
3372
+
3373
+ self._event_handler = None
3374
+ self._running = False
3375
+ logger.info(f"File watcher stopped for {self.project_path}")
3376
+
3377
+ def is_running(self) -> bool:
3378
+ """Check if the watcher is currently running.
3379
+
3380
+ Returns:
3381
+ True if watcher is active, False otherwise
3382
+ """
3383
+ with self._lock:
3384
+ return self._running and (self._native_watcher is not None or (self._observer is not None and self._observer.is_alive()))
3385
+
3386
+ def _on_file_changed(self, file_path: Path) -> None:
3387
+ """Called when a file changes (internal use by _FileChangeHandler).
3388
+
3389
+ Accumulates files and triggers debounced reindex.
3390
+
3391
+ Args:
3392
+ file_path: Path to the changed file
3393
+ """
3394
+ with self._pending_lock:
3395
+ self._pending_files.add(file_path)
3396
+
3397
+ # Cancel previous timer
3398
+ if self._pending_reindex_timer is not None:
3399
+ self._pending_reindex_timer.cancel()
3400
+
3401
+ # Start new timer
3402
+ self._pending_reindex_timer = self._create_debounce_timer()
3403
+ self._pending_reindex_timer.start()
3404
+
3405
+ def _create_debounce_timer(self) -> threading.Timer:
3406
+ """Create a new debounce timer for reindexing.
3407
+
3408
+ Returns:
3409
+ A threading.Timer configured for debounce reindexing
3410
+ """
3411
+ return threading.Timer(
3412
+ self.debounce_seconds,
3413
+ self._trigger_reindex,
3414
+ )
3415
+
3416
+ def _trigger_reindex(self) -> None:
3417
+ """Trigger reindexing of accumulated changed files.
3418
+
3419
+ This is called after the debounce period expires. Delegates to the
3420
+ dedicated indexing worker to prevent concurrent access.
3421
+ """
3422
+ with self._pending_lock:
3423
+ if not self._pending_files:
3424
+ self._pending_reindex_timer = None
3425
+ return
3426
+
3427
+ files_to_index = list(self._pending_files)
3428
+ self._pending_files.clear()
3429
+ self._pending_reindex_timer = None
3430
+
3431
+ # Delegate to dedicated worker (prevents concurrent indexing)
3432
+ self._indexing_worker.request_reindex(files_to_index)
3433
+
3434
+
3435
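Putting the pieces together, a minimal sketch of running the watcher against a project (paths are hypothetical; either the native watcher or watchdog must be available):

```python
# Minimal end-to-end sketch: watch a project and reindex on change.
store = get_store("/path/to/project", "ollama")       # assumption: same helper as above
watcher = CodebaseFileWatcher("/path/to/project", store, debounce_seconds=2.0)

watcher.start()        # tries the native watcher first, falls back to watchdog
try:
    ...                # serve MCP requests, run the bridge, etc.
finally:
    watcher.stop()     # cancels pending timers, stops the observer, shuts down the worker
```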
+ def _create_file_change_handler_class():
3436
+ """Create FileChangeHandler class that inherits from FileSystemEventHandler.
3437
+
3438
+ This is a factory function that creates the handler class dynamically
3439
+ after watchdog is imported, allowing for lazy loading.
3440
+ """
3441
+ watchdog = get_watchdog()
3442
+ FileSystemEventHandler = watchdog["FileSystemEventHandler"]
3443
+
3444
+ class _FileChangeHandler(FileSystemEventHandler):
3445
+ """Watchdog event handler for file system changes.
3446
+
3447
+ Detects file create, modify, delete, and move events, filters them,
3448
+ and notifies the watcher of relevant changes.
3449
+ """
3450
+
3451
+ def __init__(self, project_path: Path, watcher: CodebaseFileWatcher):
3452
+ """Initialize the event handler.
3453
+
3454
+ Args:
3455
+ project_path: Root path of the project being watched
3456
+ watcher: CodebaseFileWatcher instance to notify
3457
+ """
3458
+ super().__init__()
3459
+ self.project_path = project_path
3460
+ self.watcher = watcher
3461
+
3462
+ def on_created(self, event) -> None:
3463
+ """Called when a file is created."""
3464
+ if not event.is_directory and self._should_index_file(event.src_path):
3465
+ logger.debug(f"File created: {event.src_path}")
3466
+ self.watcher._on_file_changed(Path(event.src_path))
3467
+
3468
+ def on_modified(self, event) -> None:
3469
+ """Called when a file is modified."""
3470
+ if not event.is_directory and self._should_index_file(event.src_path):
3471
+ logger.debug(f"File modified: {event.src_path}")
3472
+ self.watcher._on_file_changed(Path(event.src_path))
3473
+
3474
+ def on_deleted(self, event) -> None:
3475
+ """Called when a file is deleted."""
3476
+ if not event.is_directory and self._should_index_file(event.src_path):
3477
+ logger.debug(f"File deleted: {event.src_path}")
3478
+ self.watcher._on_file_changed(Path(event.src_path))
3479
+
3480
+ def on_moved(self, event) -> None:
3481
+ """Called when a file is moved."""
3482
+ if not event.is_directory:
3483
+ # Check destination path
3484
+ if self._should_index_file(event.dest_path):
3485
+ logger.debug(f"File moved: {event.src_path} -> {event.dest_path}")
3486
+ self.watcher._on_file_changed(Path(event.dest_path))
3487
+ # Also check source path (for deletion case)
3488
+ elif self._should_index_file(event.src_path):
3489
+ logger.debug(f"File moved out: {event.src_path}")
3490
+ self.watcher._on_file_changed(Path(event.src_path))
3491
+
3492
+ def _should_index_file(self, file_path: str) -> bool:
3493
+ """Check if a file should trigger reindexing.
3494
+
3495
+ Filters based on:
3496
+ - File extension (.py only)
3497
+ - Hidden files and directories (starting with .)
3498
+ - Skip directories (venv, __pycache__, .git, node_modules, etc.)
3499
+
3500
+ Args:
3501
+ file_path: Path to the file to check
3502
+
3503
+ Returns:
3504
+ True if file should trigger reindexing, False otherwise
3505
+ """
3506
+ path = Path(file_path)
3507
+
3508
+ # Only .py files
3509
+ if path.suffix != ".py":
3510
+ return False
3511
+
3512
+ # Skip hidden files
3513
+ if path.name.startswith("."):
3514
+ return False
3515
+
3516
+ # Check for skip directories in the path
3517
+ for part in path.parts:
3518
+ if part.startswith("."): # Hidden directories like .git, .venv
3519
+ return False
3520
+ if part in {"__pycache__", "venv", "env", "node_modules"}:
3521
+ return False
3522
+
3523
+ # File is within project (resolve both paths to handle symlinks)
3524
+ try:
3525
+ path.resolve().relative_to(self.project_path)
3526
+ return True
3527
+ except ValueError:
3528
+ # File is outside project
3529
+ return False
3530
+
3531
+ return _FileChangeHandler
3532
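The filter rules above can be summarised with a few illustrative paths (not executed against a real tree; the handler construction is shown only for context):

```python
# Illustrative expectations for _should_index_file(); all paths are hypothetical.
from pathlib import Path

handler_cls = _create_file_change_handler_class()                    # requires watchdog
handler = handler_cls(project_path=Path("/repo"), watcher=watcher)   # existing CodebaseFileWatcher

handler._should_index_file("/repo/pkg/module.py")        # True: .py inside the project
handler._should_index_file("/repo/README.md")            # False: not a .py file
handler._should_index_file("/repo/.venv/lib/site.py")    # False: hidden directory in path
handler._should_index_file("/repo/venv/lib/helpers.py")  # False: skip directory (venv)
handler._should_index_file("/elsewhere/module.py")       # False: outside the project root
```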
+
3533
+
3534
+ # ========================
3535
+ # CHROMADB LOCK CLEANUP
3536
+ # ========================
3537
+
3538
+
3539
+ def _is_process_alive(pid: int) -> bool:
3540
+ """Check if a process with given PID is currently running.
3541
+
3542
+ Cross-platform process existence check.
3543
+
3544
+ Args:
3545
+ pid: Process ID to check
3546
+
3547
+ Returns:
3548
+ True if process exists, False otherwise
3549
+ """
3550
+ import os
3551
+ import sys
3552
+
3553
+ if sys.platform == "win32":
3554
+ # Windows: Use tasklist command
3555
+ import subprocess
3556
+
3557
+ try:
3558
+ result = subprocess.run(
3559
+ ["tasklist", "/FI", f"PID eq {pid}"], capture_output=True, text=True, timeout=2
3560
+ )
3561
+ return str(pid) in result.stdout
3562
+ except Exception:
3563
+ return False
3564
+ else:
3565
+ # Unix/Linux/macOS: Use os.kill(pid, 0)
3566
+ try:
3567
+ os.kill(pid, 0)
3568
+ return True
3569
+ except OSError:
3570
+ return False
3571
+ except Exception:
3572
+ return False
3573
+
3574
+
3575
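For orientation, the expected behaviour of `_is_process_alive` on a typical system:

```python
# Expected behaviour on a typical system (illustrative, not a test).
import os

print(_is_process_alive(os.getpid()))  # True: the current process exists
print(_is_process_alive(99_999_999))   # False: such a PID is very unlikely to exist
```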
+ def cleanup_stale_chromadb_locks() -> int:
3576
+ """Remove stale ChromaDB lock files on MCP server startup.
3577
+
3578
+ Scans all vectordb directories and removes lock files that:
3579
+ 1. Are older than 60 seconds (short grace period for active operations)
3580
+ 2. Don't have an owning process running (if PID can be determined)
3581
+
3582
+ This prevents 'Connection closed' errors from dead process locks.
3583
+
3584
+ Returns:
3585
+ Number of stale locks removed
3586
+ """
3587
+ vectordb_base = Path.home() / ".stravinsky" / "vectordb"
3588
+ if not vectordb_base.exists():
3589
+ return 0 # No vectordb yet, nothing to cleanup
3590
+
3591
+ import time
3592
+
3593
+ removed_count = 0
3594
+
3595
+ for project_dir in vectordb_base.iterdir():
3596
+ if not project_dir.is_dir():
3597
+ continue
3598
+
3599
+ lock_path = project_dir / ".chromadb.lock"
3600
+ if not lock_path.exists():
3601
+ continue
3602
+
3603
+ # Check lock age
3604
+ try:
3605
+ lock_age = time.time() - lock_path.stat().st_mtime
3606
+ except Exception:
3607
+ continue
3608
+
3609
+ # Aggressive cleanup: remove locks older than 60 seconds
3610
+ # This catches recently crashed processes (old 300s was too conservative)
3611
+ is_stale = lock_age > 60
3612
+
3613
+ # TODO: If lock file contains PID, check if process is alive
3614
+ # filelock doesn't write PID by default, but we could enhance this
3615
+
3616
+ if is_stale:
3617
+ try:
3618
+ lock_path.unlink(missing_ok=True)
3619
+ removed_count += 1
3620
+ logger.info(f"Removed stale lock: {lock_path} (age: {lock_age:.0f}s)")
3621
+ except Exception as e:
3622
+ logger.warning(f"Could not remove stale lock {lock_path}: {e}")
3623
+
3624
+ if removed_count > 0:
3625
+ logger.info(f"Startup cleanup: removed {removed_count} stale ChromaDB lock(s)")
3626
+
3627
+ return removed_count
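A sketch of invoking the cleanup at startup; in practice this runs once when the MCP server boots, before any vector store is opened:

```python
# Startup-time cleanup sketch; safe to call even if no vectordb directory exists yet.
removed = cleanup_stale_chromadb_locks()
if removed:
    print(f"Removed {removed} stale ChromaDB lock(s) before opening stores")
```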