codexa 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189) hide show
  1. codexa-0.4.0.dist-info/METADATA +650 -0
  2. codexa-0.4.0.dist-info/RECORD +189 -0
  3. codexa-0.4.0.dist-info/WHEEL +5 -0
  4. codexa-0.4.0.dist-info/entry_points.txt +2 -0
  5. codexa-0.4.0.dist-info/licenses/LICENSE +21 -0
  6. codexa-0.4.0.dist-info/top_level.txt +1 -0
  7. semantic_code_intelligence/__init__.py +5 -0
  8. semantic_code_intelligence/analysis/__init__.py +21 -0
  9. semantic_code_intelligence/analysis/ai_features.py +351 -0
  10. semantic_code_intelligence/bridge/__init__.py +28 -0
  11. semantic_code_intelligence/bridge/context_provider.py +245 -0
  12. semantic_code_intelligence/bridge/protocol.py +167 -0
  13. semantic_code_intelligence/bridge/server.py +348 -0
  14. semantic_code_intelligence/bridge/vscode.py +271 -0
  15. semantic_code_intelligence/ci/__init__.py +13 -0
  16. semantic_code_intelligence/ci/hooks.py +98 -0
  17. semantic_code_intelligence/ci/hotspots.py +272 -0
  18. semantic_code_intelligence/ci/impact.py +246 -0
  19. semantic_code_intelligence/ci/metrics.py +591 -0
  20. semantic_code_intelligence/ci/pr.py +412 -0
  21. semantic_code_intelligence/ci/quality.py +557 -0
  22. semantic_code_intelligence/ci/templates.py +164 -0
  23. semantic_code_intelligence/ci/trace.py +224 -0
  24. semantic_code_intelligence/cli/__init__.py +0 -0
  25. semantic_code_intelligence/cli/commands/__init__.py +0 -0
  26. semantic_code_intelligence/cli/commands/ask_cmd.py +153 -0
  27. semantic_code_intelligence/cli/commands/benchmark_cmd.py +303 -0
  28. semantic_code_intelligence/cli/commands/chat_cmd.py +252 -0
  29. semantic_code_intelligence/cli/commands/ci_gen_cmd.py +74 -0
  30. semantic_code_intelligence/cli/commands/context_cmd.py +120 -0
  31. semantic_code_intelligence/cli/commands/cross_refactor_cmd.py +113 -0
  32. semantic_code_intelligence/cli/commands/deps_cmd.py +91 -0
  33. semantic_code_intelligence/cli/commands/docs_cmd.py +101 -0
  34. semantic_code_intelligence/cli/commands/doctor_cmd.py +147 -0
  35. semantic_code_intelligence/cli/commands/evolve_cmd.py +171 -0
  36. semantic_code_intelligence/cli/commands/explain_cmd.py +112 -0
  37. semantic_code_intelligence/cli/commands/gate_cmd.py +135 -0
  38. semantic_code_intelligence/cli/commands/grep_cmd.py +234 -0
  39. semantic_code_intelligence/cli/commands/hotspots_cmd.py +119 -0
  40. semantic_code_intelligence/cli/commands/impact_cmd.py +131 -0
  41. semantic_code_intelligence/cli/commands/index_cmd.py +138 -0
  42. semantic_code_intelligence/cli/commands/init_cmd.py +152 -0
  43. semantic_code_intelligence/cli/commands/investigate_cmd.py +163 -0
  44. semantic_code_intelligence/cli/commands/languages_cmd.py +101 -0
  45. semantic_code_intelligence/cli/commands/lsp_cmd.py +49 -0
  46. semantic_code_intelligence/cli/commands/mcp_cmd.py +50 -0
  47. semantic_code_intelligence/cli/commands/metrics_cmd.py +264 -0
  48. semantic_code_intelligence/cli/commands/models_cmd.py +157 -0
  49. semantic_code_intelligence/cli/commands/plugin_cmd.py +275 -0
  50. semantic_code_intelligence/cli/commands/pr_summary_cmd.py +178 -0
  51. semantic_code_intelligence/cli/commands/quality_cmd.py +208 -0
  52. semantic_code_intelligence/cli/commands/refactor_cmd.py +103 -0
  53. semantic_code_intelligence/cli/commands/review_cmd.py +88 -0
  54. semantic_code_intelligence/cli/commands/search_cmd.py +236 -0
  55. semantic_code_intelligence/cli/commands/serve_cmd.py +117 -0
  56. semantic_code_intelligence/cli/commands/suggest_cmd.py +100 -0
  57. semantic_code_intelligence/cli/commands/summary_cmd.py +78 -0
  58. semantic_code_intelligence/cli/commands/tool_cmd.py +282 -0
  59. semantic_code_intelligence/cli/commands/trace_cmd.py +123 -0
  60. semantic_code_intelligence/cli/commands/tui_cmd.py +58 -0
  61. semantic_code_intelligence/cli/commands/viz_cmd.py +127 -0
  62. semantic_code_intelligence/cli/commands/watch_cmd.py +72 -0
  63. semantic_code_intelligence/cli/commands/web_cmd.py +61 -0
  64. semantic_code_intelligence/cli/commands/workspace_cmd.py +250 -0
  65. semantic_code_intelligence/cli/main.py +65 -0
  66. semantic_code_intelligence/cli/router.py +92 -0
  67. semantic_code_intelligence/config/__init__.py +0 -0
  68. semantic_code_intelligence/config/settings.py +260 -0
  69. semantic_code_intelligence/context/__init__.py +19 -0
  70. semantic_code_intelligence/context/engine.py +429 -0
  71. semantic_code_intelligence/context/memory.py +253 -0
  72. semantic_code_intelligence/daemon/__init__.py +1 -0
  73. semantic_code_intelligence/daemon/watcher.py +515 -0
  74. semantic_code_intelligence/docs/__init__.py +1080 -0
  75. semantic_code_intelligence/embeddings/__init__.py +0 -0
  76. semantic_code_intelligence/embeddings/enhanced.py +131 -0
  77. semantic_code_intelligence/embeddings/generator.py +149 -0
  78. semantic_code_intelligence/embeddings/model_registry.py +100 -0
  79. semantic_code_intelligence/evolution/__init__.py +1 -0
  80. semantic_code_intelligence/evolution/budget_guard.py +111 -0
  81. semantic_code_intelligence/evolution/commit_manager.py +88 -0
  82. semantic_code_intelligence/evolution/context_builder.py +131 -0
  83. semantic_code_intelligence/evolution/engine.py +249 -0
  84. semantic_code_intelligence/evolution/patch_generator.py +229 -0
  85. semantic_code_intelligence/evolution/task_selector.py +214 -0
  86. semantic_code_intelligence/evolution/test_runner.py +111 -0
  87. semantic_code_intelligence/indexing/__init__.py +0 -0
  88. semantic_code_intelligence/indexing/chunker.py +174 -0
  89. semantic_code_intelligence/indexing/parallel.py +86 -0
  90. semantic_code_intelligence/indexing/scanner.py +146 -0
  91. semantic_code_intelligence/indexing/semantic_chunker.py +337 -0
  92. semantic_code_intelligence/llm/__init__.py +62 -0
  93. semantic_code_intelligence/llm/cache.py +219 -0
  94. semantic_code_intelligence/llm/cached_provider.py +145 -0
  95. semantic_code_intelligence/llm/conversation.py +190 -0
  96. semantic_code_intelligence/llm/cross_refactor.py +272 -0
  97. semantic_code_intelligence/llm/investigation.py +274 -0
  98. semantic_code_intelligence/llm/mock_provider.py +77 -0
  99. semantic_code_intelligence/llm/ollama_provider.py +122 -0
  100. semantic_code_intelligence/llm/openai_provider.py +100 -0
  101. semantic_code_intelligence/llm/provider.py +92 -0
  102. semantic_code_intelligence/llm/rate_limiter.py +164 -0
  103. semantic_code_intelligence/llm/reasoning.py +438 -0
  104. semantic_code_intelligence/llm/safety.py +110 -0
  105. semantic_code_intelligence/llm/streaming.py +251 -0
  106. semantic_code_intelligence/lsp/__init__.py +609 -0
  107. semantic_code_intelligence/mcp/__init__.py +393 -0
  108. semantic_code_intelligence/parsing/__init__.py +19 -0
  109. semantic_code_intelligence/parsing/parser.py +375 -0
  110. semantic_code_intelligence/plugins/__init__.py +255 -0
  111. semantic_code_intelligence/plugins/examples/__init__.py +1 -0
  112. semantic_code_intelligence/plugins/examples/code_quality.py +73 -0
  113. semantic_code_intelligence/plugins/examples/search_annotator.py +56 -0
  114. semantic_code_intelligence/scalability/__init__.py +205 -0
  115. semantic_code_intelligence/search/__init__.py +0 -0
  116. semantic_code_intelligence/search/formatter.py +123 -0
  117. semantic_code_intelligence/search/grep.py +361 -0
  118. semantic_code_intelligence/search/hybrid_search.py +170 -0
  119. semantic_code_intelligence/search/keyword_search.py +311 -0
  120. semantic_code_intelligence/search/section_expander.py +103 -0
  121. semantic_code_intelligence/services/__init__.py +0 -0
  122. semantic_code_intelligence/services/indexing_service.py +630 -0
  123. semantic_code_intelligence/services/search_service.py +269 -0
  124. semantic_code_intelligence/storage/__init__.py +0 -0
  125. semantic_code_intelligence/storage/chunk_hash_store.py +86 -0
  126. semantic_code_intelligence/storage/hash_store.py +66 -0
  127. semantic_code_intelligence/storage/index_manifest.py +85 -0
  128. semantic_code_intelligence/storage/index_stats.py +138 -0
  129. semantic_code_intelligence/storage/query_history.py +160 -0
  130. semantic_code_intelligence/storage/symbol_registry.py +209 -0
  131. semantic_code_intelligence/storage/vector_store.py +297 -0
  132. semantic_code_intelligence/tests/__init__.py +0 -0
  133. semantic_code_intelligence/tests/test_ai_features.py +351 -0
  134. semantic_code_intelligence/tests/test_chunker.py +119 -0
  135. semantic_code_intelligence/tests/test_cli.py +188 -0
  136. semantic_code_intelligence/tests/test_config.py +154 -0
  137. semantic_code_intelligence/tests/test_context.py +381 -0
  138. semantic_code_intelligence/tests/test_embeddings.py +73 -0
  139. semantic_code_intelligence/tests/test_endtoend.py +1142 -0
  140. semantic_code_intelligence/tests/test_enhanced_embeddings.py +92 -0
  141. semantic_code_intelligence/tests/test_hash_store.py +79 -0
  142. semantic_code_intelligence/tests/test_logging.py +55 -0
  143. semantic_code_intelligence/tests/test_new_cli.py +138 -0
  144. semantic_code_intelligence/tests/test_parser.py +495 -0
  145. semantic_code_intelligence/tests/test_phase10.py +355 -0
  146. semantic_code_intelligence/tests/test_phase11.py +593 -0
  147. semantic_code_intelligence/tests/test_phase12.py +375 -0
  148. semantic_code_intelligence/tests/test_phase13.py +663 -0
  149. semantic_code_intelligence/tests/test_phase14.py +568 -0
  150. semantic_code_intelligence/tests/test_phase15.py +814 -0
  151. semantic_code_intelligence/tests/test_phase16.py +792 -0
  152. semantic_code_intelligence/tests/test_phase17.py +815 -0
  153. semantic_code_intelligence/tests/test_phase18.py +934 -0
  154. semantic_code_intelligence/tests/test_phase19.py +986 -0
  155. semantic_code_intelligence/tests/test_phase20.py +2753 -0
  156. semantic_code_intelligence/tests/test_phase20b.py +2058 -0
  157. semantic_code_intelligence/tests/test_phase20c.py +962 -0
  158. semantic_code_intelligence/tests/test_phase21.py +428 -0
  159. semantic_code_intelligence/tests/test_phase22.py +799 -0
  160. semantic_code_intelligence/tests/test_phase23.py +783 -0
  161. semantic_code_intelligence/tests/test_phase24.py +715 -0
  162. semantic_code_intelligence/tests/test_phase25.py +496 -0
  163. semantic_code_intelligence/tests/test_phase26.py +251 -0
  164. semantic_code_intelligence/tests/test_phase27.py +531 -0
  165. semantic_code_intelligence/tests/test_phase8.py +592 -0
  166. semantic_code_intelligence/tests/test_phase9.py +643 -0
  167. semantic_code_intelligence/tests/test_plugins.py +293 -0
  168. semantic_code_intelligence/tests/test_priority_features.py +727 -0
  169. semantic_code_intelligence/tests/test_router.py +41 -0
  170. semantic_code_intelligence/tests/test_scalability.py +138 -0
  171. semantic_code_intelligence/tests/test_scanner.py +125 -0
  172. semantic_code_intelligence/tests/test_search.py +160 -0
  173. semantic_code_intelligence/tests/test_semantic_chunker.py +255 -0
  174. semantic_code_intelligence/tests/test_tools.py +182 -0
  175. semantic_code_intelligence/tests/test_vector_store.py +151 -0
  176. semantic_code_intelligence/tests/test_watcher.py +211 -0
  177. semantic_code_intelligence/tools/__init__.py +442 -0
  178. semantic_code_intelligence/tools/executor.py +232 -0
  179. semantic_code_intelligence/tools/protocol.py +200 -0
  180. semantic_code_intelligence/tui/__init__.py +454 -0
  181. semantic_code_intelligence/utils/__init__.py +0 -0
  182. semantic_code_intelligence/utils/logging.py +112 -0
  183. semantic_code_intelligence/version.py +3 -0
  184. semantic_code_intelligence/web/__init__.py +11 -0
  185. semantic_code_intelligence/web/api.py +289 -0
  186. semantic_code_intelligence/web/server.py +397 -0
  187. semantic_code_intelligence/web/ui.py +659 -0
  188. semantic_code_intelligence/web/visualize.py +226 -0
  189. semantic_code_intelligence/workspace/__init__.py +427 -0
@@ -0,0 +1,361 @@
1
+ """Raw filesystem grep — search files directly without requiring an index.
2
+
3
+ Provides ripgrep-compatible grep that works on raw files, not just indexed
4
+ chunks. Uses ``ripgrep`` if available on PATH for maximum speed, falling
5
+ back to a pure-Python implementation.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import os
11
+ import re
12
+ import shutil
13
+ import subprocess
14
+ import json
15
+ from dataclasses import dataclass
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ from semantic_code_intelligence.config.settings import AppConfig, load_config
20
+ from semantic_code_intelligence.utils.logging import get_logger
21
+
22
+ logger = get_logger("search.grep")
23
+
24
+
25
@dataclass
class GrepMatch:
    """One line reported by a grep run (either a hit or a context line)."""

    file_path: str
    line_number: int
    line_content: str
    column: int = 0
    is_context: bool = False  # True for -A/-B context lines

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict; ``is_context`` is emitted only when set."""
        payload: dict[str, Any] = dict(
            file_path=self.file_path,
            line_number=self.line_number,
            line_content=self.line_content,
            column=self.column,
        )
        if not self.is_context:
            return payload
        payload["is_context"] = True
        return payload
45
+
46
+
47
@dataclass
class GrepResult:
    """Outcome of one grep invocation, including which backend produced it."""

    pattern: str
    matches: list[GrepMatch]
    files_searched: int
    files_matched: int
    backend: str  # "ripgrep" or "python"

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict, adding a derived ``match_count`` entry."""
        serialized_matches = []
        for entry in self.matches:
            serialized_matches.append(entry.to_dict())
        summary: dict[str, Any] = {}
        summary["pattern"] = self.pattern
        summary["match_count"] = len(self.matches)
        summary["files_searched"] = self.files_searched
        summary["files_matched"] = self.files_matched
        summary["backend"] = self.backend
        summary["matches"] = serialized_matches
        return summary
66
+
67
+
68
def _has_ripgrep() -> str | None:
    """Look up the ``rg`` executable on PATH; ``None`` when it is not found."""
    rg_path = shutil.which("rg")
    return rg_path
71
+
72
+
73
def _ripgrep_search(
    pattern: str,
    root: Path,
    *,
    case_insensitive: bool = True,
    max_results: int = 100,
    file_glob: str | None = None,
    context_before: int = 0,
    context_after: int = 0,
    word_match: bool = False,
    invert_match: bool = False,
    include_hidden: bool = False,
    count_only: bool = False,
) -> GrepResult:
    """Run ripgrep with ``--json`` and convert its output into a GrepResult.

    Args:
        pattern: Regex handed to ripgrep.
        root: Directory to search; made absolute before use.
        case_insensitive: Adds ``-i``.
        max_results: Passed as ``--max-count`` and also used to cap the
            returned list.
        file_glob: Optional glob passed via ``-g``.
        context_before: Lines of leading context (``-B``) when positive.
        context_after: Lines of trailing context (``-A``) when positive.
        word_match: Adds ``-w``.
        invert_match: Adds ``--invert-match``.
        include_hidden: Adds ``--hidden``.
        count_only: Adds ``--count``.

    Returns:
        GrepResult with ``backend="ripgrep"``. ``files_searched`` is always 0.
        An empty result is returned if ripgrep times out after 30 seconds.

    Raises:
        RuntimeError: If no ``rg`` binary is found on PATH.
    """
    rg = _has_ripgrep()
    if not rg:
        raise RuntimeError("ripgrep not found")

    # The process runs with cwd=root, so a relative root passed as the search
    # path would be resolved a second time (root/root). Use an absolute path.
    search_root = str(Path(root).absolute())

    cmd = [rg, "--json", "--max-count", str(max_results)]
    if case_insensitive:
        cmd.append("-i")
    if file_glob:
        cmd.extend(["-g", file_glob])
    if context_before > 0:
        cmd.extend(["-B", str(context_before)])
    if context_after > 0:
        cmd.extend(["-A", str(context_after)])
    if word_match:
        cmd.append("-w")
    if invert_match:
        cmd.append("--invert-match")
    if include_hidden:
        cmd.append("--hidden")
    if count_only:
        # NOTE(review): ripgrep documents --json as incompatible with --count;
        # the output is then unlikely to be JSON Lines and would be skipped by
        # the parser below. Confirm the intended count_only semantics.
        cmd.append("--count")

    # "--" ends option parsing so a pattern starting with "-" is treated as a
    # pattern and can never be interpreted as a ripgrep flag.
    cmd.extend(["--", pattern, search_root])

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=30,
            cwd=search_root,
        )
    except subprocess.TimeoutExpired:
        logger.warning("ripgrep timed out after 30s")
        return GrepResult(pattern=pattern, matches=[], files_searched=0,
                          files_matched=0, backend="ripgrep")

    # ripgrep exits 0 on matches and 1 on no matches; anything else is an error.
    if result.returncode not in (0, 1):
        logger.warning(
            "ripgrep exited with code %s: %s",
            result.returncode,
            (result.stderr or "").strip(),
        )

    matches: list[GrepMatch] = []
    files_matched: set[str] = set()

    for line in result.stdout.splitlines():
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            continue
        if not isinstance(data, dict):
            continue

        dtype = data.get("type")
        if dtype == "match":
            match_data = data["data"]
            path_text = match_data["path"]["text"]
            line_number = match_data["line_number"]
            line_content = match_data["lines"]["text"].rstrip("\n")
            submatches = match_data.get("submatches", [])
            if submatches:
                columns = [submatch.get("start", 0) for submatch in submatches]
            else:
                # Inverted matches are reported with an empty submatch list;
                # still emit the line, with no meaningful column.
                columns = [0]
            for column in columns:
                matches.append(GrepMatch(
                    file_path=path_text,
                    line_number=line_number,
                    line_content=line_content,
                    column=column,
                ))
            files_matched.add(path_text)
        elif dtype == "context":
            ctx_data = data["data"]
            matches.append(GrepMatch(
                file_path=ctx_data["path"]["text"],
                line_number=ctx_data["line_number"],
                line_content=ctx_data["lines"]["text"].rstrip("\n"),
                column=0,
                is_context=True,
            ))

    return GrepResult(
        pattern=pattern,
        matches=matches[:max_results],
        files_searched=0,  # ripgrep doesn't report this easily
        files_matched=len(files_matched),
        backend="ripgrep",
    )
165
+
166
+
167
def _python_grep(
    pattern: str,
    root: Path,
    *,
    case_insensitive: bool = True,
    max_results: int = 100,
    extensions: set[str] | None = None,
    context_before: int = 0,
    context_after: int = 0,
    word_match: bool = False,
    invert_match: bool = False,
    include_hidden: bool = False,
) -> GrepResult:
    """Pure-Python grep fallback over raw files.

    Args:
        pattern: Regular expression (Python ``re`` syntax).
        root: Directory to walk.
        case_insensitive: Compile the pattern with ``re.IGNORECASE``.
        max_results: Stop once this many entries (matches plus context
            lines) have been collected; the check runs after each file.
        extensions: File suffixes to search. When ``None``, taken from the
            project config, falling back to a built-in list if that fails.
        context_before: Lines of context before each match.
        context_after: Lines of context after each match.
        word_match: Wrap the pattern in word boundaries.
        invert_match: Report lines that do NOT match.
        include_hidden: Also visit dot-prefixed files and directories.

    Returns:
        GrepResult with ``backend="python"`` and paths relative to ``root``.
        An invalid pattern is logged and yields an empty result.
    """
    # Group the pattern so the word boundaries apply to the whole expression,
    # including every branch of an alternation such as "foo|bar".
    actual_pattern = rf"\b(?:{pattern})\b" if word_match else pattern
    flags = re.IGNORECASE if case_insensitive else 0
    try:
        compiled = re.compile(actual_pattern, flags)
    except re.error as exc:
        logger.warning("Invalid regex pattern %r: %s", pattern, exc)
        return GrepResult(pattern=pattern, matches=[], files_searched=0,
                          files_matched=0, backend="python")

    if extensions is None:
        try:
            config = load_config(root)
            extensions = set(config.index.extensions)
        except Exception:
            # Best effort: searching must still work without a usable config.
            extensions = {".py", ".js", ".ts", ".java", ".go", ".rs", ".rb", ".cpp", ".cs"}

    skipped_dirs = {"node_modules", "__pycache__"}
    # Negative context sizes would otherwise produce empty display windows.
    before = max(0, context_before)
    after = max(0, context_after)

    matches: list[GrepMatch] = []
    files_searched = 0
    files_matched: set[str] = set()

    for dirpath, dirnames, filenames in os.walk(root):
        # Prune in place so os.walk never descends into skipped directories,
        # at any depth.
        dirnames[:] = [
            d for d in dirnames
            if d not in skipped_dirs and (include_hidden or not d.startswith("."))
        ]

        for fname in filenames:
            if not include_hidden and fname.startswith("."):
                continue
            if Path(fname).suffix not in extensions:
                continue

            fpath = Path(dirpath) / fname
            files_searched += 1

            try:
                content = fpath.read_text(encoding="utf-8", errors="replace")
            except (OSError, PermissionError):
                continue

            lines = content.splitlines()
            rel_path = str(fpath.relative_to(root))

            # Selected line index -> column of the hit (0 for inverted matches).
            hit_columns: dict[int, int] = {}
            for lineno_idx, line in enumerate(lines):
                found = compiled.search(line)
                if invert_match:
                    if found is None:
                        hit_columns[lineno_idx] = 0
                elif found is not None:
                    hit_columns[lineno_idx] = found.start()

            if not hit_columns:
                continue

            files_matched.add(rel_path)

            # Union of all display windows. A line that is itself a hit is
            # always reported as a hit, even when it also lies inside another
            # hit's context window.
            shown: set[int] = set()
            for lineno_idx in hit_columns:
                window_start = max(0, lineno_idx - before)
                window_end = min(len(lines), lineno_idx + 1 + after)
                shown.update(range(window_start, window_end))

            for lineno_idx in sorted(shown):
                column = hit_columns.get(lineno_idx)
                matches.append(GrepMatch(
                    file_path=rel_path,
                    line_number=lineno_idx + 1,
                    line_content=lines[lineno_idx],
                    column=0 if column is None else column,
                    is_context=column is None,
                ))

            if len(matches) >= max_results:
                return GrepResult(
                    pattern=pattern,
                    matches=matches[:max_results],
                    files_searched=files_searched,
                    files_matched=len(files_matched),
                    backend="python",
                )

    return GrepResult(
        pattern=pattern,
        matches=matches,
        files_searched=files_searched,
        files_matched=len(files_matched),
        backend="python",
    )
292
+
293
+
294
def _extensions_from_glob(file_glob: str) -> set[str] | None:
    """Derive a set of file suffixes from a simple glob, or ``None`` if unclear.

    Handles forms such as ``*.py``, ``src/**/*.py`` and ``*.{js,ts}``.
    Negated globs and globs whose extension part contains wildcards cannot
    be expressed as a suffix set and yield ``None``.
    """
    if file_glob.startswith("!"):
        return None
    name = file_glob.replace("\\", "/").rsplit("/", 1)[-1]
    stem, dot, ext = name.rpartition(".")
    if not dot or not stem or not ext:
        return None
    if ext.startswith("{") and ext.endswith("}"):
        alternatives = ext[1:-1].split(",")
    else:
        alternatives = [ext]
    for alt in alternatives:
        if not alt or any(ch in "*?[]{}!" for ch in alt):
            return None
    return {f".{alt}" for alt in alternatives}


def grep_search(
    pattern: str,
    root: Path,
    *,
    case_insensitive: bool = True,
    max_results: int = 100,
    use_ripgrep: bool = True,
    file_glob: str | None = None,
    context_before: int = 0,
    context_after: int = 0,
    word_match: bool = False,
    invert_match: bool = False,
    include_hidden: bool = False,
    count_only: bool = False,
) -> GrepResult:
    """Search raw files using ripgrep (if available) or Python fallback.

    Unlike indexed search modes, this searches the actual filesystem
    without requiring an index. Instant results, zero setup.

    Args:
        pattern: Regex pattern to search for.
        root: Project root to search.
        case_insensitive: Case-insensitive matching.
        max_results: Maximum matches to return.
        use_ripgrep: Try ripgrep first (recommended).
        file_glob: Optional glob to filter files (e.g., "*.py"). The Python
            fallback only honours the glob's file extension.
        context_before: Lines of context before each match (-B).
        context_after: Lines of context after each match (-A).
        word_match: Match whole words only (-w).
        invert_match: Show non-matching lines (-v).
        include_hidden: Include hidden files/directories.
        count_only: Only return match counts per file (-c). Forwarded to the
            ripgrep backend only; the Python fallback does not receive it.

    Returns:
        GrepResult from whichever backend ran; its ``backend`` field says which.
    """
    if use_ripgrep and _has_ripgrep():
        try:
            return _ripgrep_search(
                pattern, root,
                case_insensitive=case_insensitive,
                max_results=max_results,
                file_glob=file_glob,
                context_before=context_before,
                context_after=context_after,
                word_match=word_match,
                invert_match=invert_match,
                include_hidden=include_hidden,
                count_only=count_only,
            )
        except Exception:
            # Any ripgrep failure degrades to the pure-Python path; keep the
            # traceback in the debug log so the cause is not lost.
            logger.debug("ripgrep failed, falling back to Python grep", exc_info=True)

    # Path("*.py".lstrip("*")).suffix is "" (a leading-dot name has no suffix),
    # so the extension is parsed from the glob text instead.
    extensions = _extensions_from_glob(file_glob) if file_glob else None

    return _python_grep(
        pattern, root,
        case_insensitive=case_insensitive,
        max_results=max_results,
        extensions=extensions,
        context_before=context_before,
        context_after=context_after,
        word_match=word_match,
        invert_match=invert_match,
        include_hidden=include_hidden,
    )
@@ -0,0 +1,170 @@
1
+ """Hybrid search — fuses semantic (FAISS) and keyword (BM25) results via RRF.
2
+
3
+ Reciprocal Rank Fusion combines two ranked lists into a single list that
4
+ benefits from both semantic understanding and exact keyword matching.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ from semantic_code_intelligence.config.settings import AppConfig, load_config
14
+ from semantic_code_intelligence.embeddings.generator import generate_embeddings
15
+ from semantic_code_intelligence.search.keyword_search import (
16
+ BM25Index,
17
+ KeywordResult,
18
+ _get_bm25,
19
+ keyword_search,
20
+ )
21
+ from semantic_code_intelligence.storage.vector_store import VectorStore
22
+ from semantic_code_intelligence.utils.logging import get_logger
23
+
24
+ logger = get_logger("search.hybrid")
25
+
26
+ # Default RRF constant (k=60 is standard in literature)
27
+ RRF_K = 60
28
+
29
+
30
@dataclass
class HybridResult:
    """One chunk returned by hybrid search, carrying fused and per-arm scores."""

    file_path: str
    start_line: int
    end_line: int
    language: str
    content: str
    score: float  # fused RRF score
    semantic_score: float  # original cosine similarity (0 if not in semantic)
    keyword_score: float  # original BM25 score (0 if not in keyword)
    chunk_index: int

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict, rounding the fused score to 6 places
        and the per-arm scores to 4."""
        fused = round(self.score, 6)
        semantic = round(self.semantic_score, 4)
        keyword = round(self.keyword_score, 4)
        return dict(
            file_path=self.file_path,
            start_line=self.start_line,
            end_line=self.end_line,
            language=self.language,
            content=self.content,
            score=fused,
            semantic_score=semantic,
            keyword_score=keyword,
            chunk_index=self.chunk_index,
        )
56
+
57
+
58
def _chunk_key(meta: Any) -> str:
    """Build the ``path:start:end`` identity string used to de-duplicate chunks."""
    parts = (meta.file_path, meta.start_line, meta.end_line)
    return ":".join(f"{part}" for part in parts)
61
+
62
+
63
def reciprocal_rank_fusion(
    semantic_ranking: list[tuple[int, float]],
    keyword_ranking: list[tuple[int, float]],
    k: int = RRF_K,
) -> list[tuple[int, float, float, float]]:
    """Merge two best-first rankings with Reciprocal Rank Fusion.

    Each appearance of a chunk at zero-based position ``rank`` in either list
    adds ``1 / (k + rank + 1)`` to that chunk's fused score.

    Args:
        semantic_ranking: [(chunk_index_in_store, cosine_score), ...] ordered best-first.
        keyword_ranking: [(chunk_index_in_store, bm25_score), ...] ordered best-first.
        k: RRF smoothing constant.

    Returns:
        [(chunk_index, fused_score, semantic_score, keyword_score), ...]
        sorted by fused_score descending. A chunk absent from one list gets
        0.0 as that list's original score.
    """
    fused_totals: dict[int, float] = {}
    semantic_raw: dict[int, float] = {}
    keyword_raw: dict[int, float] = {}

    # Process the semantic arm first, then the keyword arm, so first-seen
    # ordering (which decides ties in the stable sort below) is preserved.
    arms = ((semantic_ranking, semantic_raw), (keyword_ranking, keyword_raw))
    for ranking, raw_by_chunk in arms:
        for position, (chunk_idx, raw_score) in enumerate(ranking):
            contribution = 1.0 / (k + position + 1)
            fused_totals[chunk_idx] = fused_totals.get(chunk_idx, 0.0) + contribution
            raw_by_chunk[chunk_idx] = raw_score

    return sorted(
        (
            (
                chunk_idx,
                total,
                semantic_raw.get(chunk_idx, 0.0),
                keyword_raw.get(chunk_idx, 0.0),
            )
            for chunk_idx, total in fused_totals.items()
        ),
        key=lambda entry: entry[1],
        reverse=True,
    )
97
+
98
+
99
def hybrid_search(
    query: str,
    store: VectorStore,
    index_dir: Path,
    model_name: str = "all-MiniLM-L6-v2",
    top_k: int = 10,
    semantic_weight: int | None = None,
    keyword_weight: int | None = None,
) -> list[HybridResult]:
    """Execute a hybrid search combining semantic and BM25 keyword results.

    Args:
        query: Natural language or keyword query.
        store: Loaded VectorStore.
        index_dir: Path to index directory.
        model_name: Embedding model name.
        top_k: Number of final results. Non-positive values yield ``[]``.
        semantic_weight: How many candidates to pull from semantic (default 2×top_k).
        keyword_weight: How many candidates to pull from keyword (default 2×top_k).

    Returns:
        List of HybridResult, sorted by fused RRF score.
    """
    # Nothing to return for an empty store or a non-positive result count.
    # A negative top_k would otherwise slice the fused list from the end.
    if store.size == 0 or top_k <= 0:
        return []

    candidate_k = top_k * 2

    # --- Semantic arm ---
    query_embedding = generate_embeddings([query], model_name=model_name)[0]
    sem_raw = store.search(query_embedding, top_k=semantic_weight or candidate_k)

    # Map each chunk key to the position of its first occurrence in
    # store.metadata, so semantic hits can be expressed as metadata indices.
    meta_to_idx: dict[str, int] = {}
    for i, m in enumerate(store.metadata):
        meta_to_idx.setdefault(_chunk_key(m), i)

    semantic_ranking: list[tuple[int, float]] = []
    for meta, score in sem_raw:
        idx = meta_to_idx.get(_chunk_key(meta))
        if idx is not None:
            semantic_ranking.append((idx, float(score)))

    # --- Keyword arm (BM25) ---
    # NOTE(review): presumably bm25.search returns (metadata_index, score)
    # pairs, as reciprocal_rank_fusion expects — confirm in keyword_search.
    bm25 = _get_bm25(index_dir, store)
    keyword_ranking = bm25.search(query, top_k=keyword_weight or candidate_k)

    # --- Fusion ---
    fused = reciprocal_rank_fusion(semantic_ranking, keyword_ranking)

    results: list[HybridResult] = []
    for idx, fused_score, sem_score, kw_score in fused[:top_k]:
        meta = store.metadata[idx]
        results.append(
            HybridResult(
                file_path=meta.file_path,
                start_line=meta.start_line,
                end_line=meta.end_line,
                language=meta.language,
                content=meta.content,
                score=fused_score,
                semantic_score=sem_score,
                keyword_score=kw_score,
                chunk_index=meta.chunk_index,
            )
        )

    return results