codexa 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codexa-0.4.0.dist-info/METADATA +650 -0
- codexa-0.4.0.dist-info/RECORD +189 -0
- codexa-0.4.0.dist-info/WHEEL +5 -0
- codexa-0.4.0.dist-info/entry_points.txt +2 -0
- codexa-0.4.0.dist-info/licenses/LICENSE +21 -0
- codexa-0.4.0.dist-info/top_level.txt +1 -0
- semantic_code_intelligence/__init__.py +5 -0
- semantic_code_intelligence/analysis/__init__.py +21 -0
- semantic_code_intelligence/analysis/ai_features.py +351 -0
- semantic_code_intelligence/bridge/__init__.py +28 -0
- semantic_code_intelligence/bridge/context_provider.py +245 -0
- semantic_code_intelligence/bridge/protocol.py +167 -0
- semantic_code_intelligence/bridge/server.py +348 -0
- semantic_code_intelligence/bridge/vscode.py +271 -0
- semantic_code_intelligence/ci/__init__.py +13 -0
- semantic_code_intelligence/ci/hooks.py +98 -0
- semantic_code_intelligence/ci/hotspots.py +272 -0
- semantic_code_intelligence/ci/impact.py +246 -0
- semantic_code_intelligence/ci/metrics.py +591 -0
- semantic_code_intelligence/ci/pr.py +412 -0
- semantic_code_intelligence/ci/quality.py +557 -0
- semantic_code_intelligence/ci/templates.py +164 -0
- semantic_code_intelligence/ci/trace.py +224 -0
- semantic_code_intelligence/cli/__init__.py +0 -0
- semantic_code_intelligence/cli/commands/__init__.py +0 -0
- semantic_code_intelligence/cli/commands/ask_cmd.py +153 -0
- semantic_code_intelligence/cli/commands/benchmark_cmd.py +303 -0
- semantic_code_intelligence/cli/commands/chat_cmd.py +252 -0
- semantic_code_intelligence/cli/commands/ci_gen_cmd.py +74 -0
- semantic_code_intelligence/cli/commands/context_cmd.py +120 -0
- semantic_code_intelligence/cli/commands/cross_refactor_cmd.py +113 -0
- semantic_code_intelligence/cli/commands/deps_cmd.py +91 -0
- semantic_code_intelligence/cli/commands/docs_cmd.py +101 -0
- semantic_code_intelligence/cli/commands/doctor_cmd.py +147 -0
- semantic_code_intelligence/cli/commands/evolve_cmd.py +171 -0
- semantic_code_intelligence/cli/commands/explain_cmd.py +112 -0
- semantic_code_intelligence/cli/commands/gate_cmd.py +135 -0
- semantic_code_intelligence/cli/commands/grep_cmd.py +234 -0
- semantic_code_intelligence/cli/commands/hotspots_cmd.py +119 -0
- semantic_code_intelligence/cli/commands/impact_cmd.py +131 -0
- semantic_code_intelligence/cli/commands/index_cmd.py +138 -0
- semantic_code_intelligence/cli/commands/init_cmd.py +152 -0
- semantic_code_intelligence/cli/commands/investigate_cmd.py +163 -0
- semantic_code_intelligence/cli/commands/languages_cmd.py +101 -0
- semantic_code_intelligence/cli/commands/lsp_cmd.py +49 -0
- semantic_code_intelligence/cli/commands/mcp_cmd.py +50 -0
- semantic_code_intelligence/cli/commands/metrics_cmd.py +264 -0
- semantic_code_intelligence/cli/commands/models_cmd.py +157 -0
- semantic_code_intelligence/cli/commands/plugin_cmd.py +275 -0
- semantic_code_intelligence/cli/commands/pr_summary_cmd.py +178 -0
- semantic_code_intelligence/cli/commands/quality_cmd.py +208 -0
- semantic_code_intelligence/cli/commands/refactor_cmd.py +103 -0
- semantic_code_intelligence/cli/commands/review_cmd.py +88 -0
- semantic_code_intelligence/cli/commands/search_cmd.py +236 -0
- semantic_code_intelligence/cli/commands/serve_cmd.py +117 -0
- semantic_code_intelligence/cli/commands/suggest_cmd.py +100 -0
- semantic_code_intelligence/cli/commands/summary_cmd.py +78 -0
- semantic_code_intelligence/cli/commands/tool_cmd.py +282 -0
- semantic_code_intelligence/cli/commands/trace_cmd.py +123 -0
- semantic_code_intelligence/cli/commands/tui_cmd.py +58 -0
- semantic_code_intelligence/cli/commands/viz_cmd.py +127 -0
- semantic_code_intelligence/cli/commands/watch_cmd.py +72 -0
- semantic_code_intelligence/cli/commands/web_cmd.py +61 -0
- semantic_code_intelligence/cli/commands/workspace_cmd.py +250 -0
- semantic_code_intelligence/cli/main.py +65 -0
- semantic_code_intelligence/cli/router.py +92 -0
- semantic_code_intelligence/config/__init__.py +0 -0
- semantic_code_intelligence/config/settings.py +260 -0
- semantic_code_intelligence/context/__init__.py +19 -0
- semantic_code_intelligence/context/engine.py +429 -0
- semantic_code_intelligence/context/memory.py +253 -0
- semantic_code_intelligence/daemon/__init__.py +1 -0
- semantic_code_intelligence/daemon/watcher.py +515 -0
- semantic_code_intelligence/docs/__init__.py +1080 -0
- semantic_code_intelligence/embeddings/__init__.py +0 -0
- semantic_code_intelligence/embeddings/enhanced.py +131 -0
- semantic_code_intelligence/embeddings/generator.py +149 -0
- semantic_code_intelligence/embeddings/model_registry.py +100 -0
- semantic_code_intelligence/evolution/__init__.py +1 -0
- semantic_code_intelligence/evolution/budget_guard.py +111 -0
- semantic_code_intelligence/evolution/commit_manager.py +88 -0
- semantic_code_intelligence/evolution/context_builder.py +131 -0
- semantic_code_intelligence/evolution/engine.py +249 -0
- semantic_code_intelligence/evolution/patch_generator.py +229 -0
- semantic_code_intelligence/evolution/task_selector.py +214 -0
- semantic_code_intelligence/evolution/test_runner.py +111 -0
- semantic_code_intelligence/indexing/__init__.py +0 -0
- semantic_code_intelligence/indexing/chunker.py +174 -0
- semantic_code_intelligence/indexing/parallel.py +86 -0
- semantic_code_intelligence/indexing/scanner.py +146 -0
- semantic_code_intelligence/indexing/semantic_chunker.py +337 -0
- semantic_code_intelligence/llm/__init__.py +62 -0
- semantic_code_intelligence/llm/cache.py +219 -0
- semantic_code_intelligence/llm/cached_provider.py +145 -0
- semantic_code_intelligence/llm/conversation.py +190 -0
- semantic_code_intelligence/llm/cross_refactor.py +272 -0
- semantic_code_intelligence/llm/investigation.py +274 -0
- semantic_code_intelligence/llm/mock_provider.py +77 -0
- semantic_code_intelligence/llm/ollama_provider.py +122 -0
- semantic_code_intelligence/llm/openai_provider.py +100 -0
- semantic_code_intelligence/llm/provider.py +92 -0
- semantic_code_intelligence/llm/rate_limiter.py +164 -0
- semantic_code_intelligence/llm/reasoning.py +438 -0
- semantic_code_intelligence/llm/safety.py +110 -0
- semantic_code_intelligence/llm/streaming.py +251 -0
- semantic_code_intelligence/lsp/__init__.py +609 -0
- semantic_code_intelligence/mcp/__init__.py +393 -0
- semantic_code_intelligence/parsing/__init__.py +19 -0
- semantic_code_intelligence/parsing/parser.py +375 -0
- semantic_code_intelligence/plugins/__init__.py +255 -0
- semantic_code_intelligence/plugins/examples/__init__.py +1 -0
- semantic_code_intelligence/plugins/examples/code_quality.py +73 -0
- semantic_code_intelligence/plugins/examples/search_annotator.py +56 -0
- semantic_code_intelligence/scalability/__init__.py +205 -0
- semantic_code_intelligence/search/__init__.py +0 -0
- semantic_code_intelligence/search/formatter.py +123 -0
- semantic_code_intelligence/search/grep.py +361 -0
- semantic_code_intelligence/search/hybrid_search.py +170 -0
- semantic_code_intelligence/search/keyword_search.py +311 -0
- semantic_code_intelligence/search/section_expander.py +103 -0
- semantic_code_intelligence/services/__init__.py +0 -0
- semantic_code_intelligence/services/indexing_service.py +630 -0
- semantic_code_intelligence/services/search_service.py +269 -0
- semantic_code_intelligence/storage/__init__.py +0 -0
- semantic_code_intelligence/storage/chunk_hash_store.py +86 -0
- semantic_code_intelligence/storage/hash_store.py +66 -0
- semantic_code_intelligence/storage/index_manifest.py +85 -0
- semantic_code_intelligence/storage/index_stats.py +138 -0
- semantic_code_intelligence/storage/query_history.py +160 -0
- semantic_code_intelligence/storage/symbol_registry.py +209 -0
- semantic_code_intelligence/storage/vector_store.py +297 -0
- semantic_code_intelligence/tests/__init__.py +0 -0
- semantic_code_intelligence/tests/test_ai_features.py +351 -0
- semantic_code_intelligence/tests/test_chunker.py +119 -0
- semantic_code_intelligence/tests/test_cli.py +188 -0
- semantic_code_intelligence/tests/test_config.py +154 -0
- semantic_code_intelligence/tests/test_context.py +381 -0
- semantic_code_intelligence/tests/test_embeddings.py +73 -0
- semantic_code_intelligence/tests/test_endtoend.py +1142 -0
- semantic_code_intelligence/tests/test_enhanced_embeddings.py +92 -0
- semantic_code_intelligence/tests/test_hash_store.py +79 -0
- semantic_code_intelligence/tests/test_logging.py +55 -0
- semantic_code_intelligence/tests/test_new_cli.py +138 -0
- semantic_code_intelligence/tests/test_parser.py +495 -0
- semantic_code_intelligence/tests/test_phase10.py +355 -0
- semantic_code_intelligence/tests/test_phase11.py +593 -0
- semantic_code_intelligence/tests/test_phase12.py +375 -0
- semantic_code_intelligence/tests/test_phase13.py +663 -0
- semantic_code_intelligence/tests/test_phase14.py +568 -0
- semantic_code_intelligence/tests/test_phase15.py +814 -0
- semantic_code_intelligence/tests/test_phase16.py +792 -0
- semantic_code_intelligence/tests/test_phase17.py +815 -0
- semantic_code_intelligence/tests/test_phase18.py +934 -0
- semantic_code_intelligence/tests/test_phase19.py +986 -0
- semantic_code_intelligence/tests/test_phase20.py +2753 -0
- semantic_code_intelligence/tests/test_phase20b.py +2058 -0
- semantic_code_intelligence/tests/test_phase20c.py +962 -0
- semantic_code_intelligence/tests/test_phase21.py +428 -0
- semantic_code_intelligence/tests/test_phase22.py +799 -0
- semantic_code_intelligence/tests/test_phase23.py +783 -0
- semantic_code_intelligence/tests/test_phase24.py +715 -0
- semantic_code_intelligence/tests/test_phase25.py +496 -0
- semantic_code_intelligence/tests/test_phase26.py +251 -0
- semantic_code_intelligence/tests/test_phase27.py +531 -0
- semantic_code_intelligence/tests/test_phase8.py +592 -0
- semantic_code_intelligence/tests/test_phase9.py +643 -0
- semantic_code_intelligence/tests/test_plugins.py +293 -0
- semantic_code_intelligence/tests/test_priority_features.py +727 -0
- semantic_code_intelligence/tests/test_router.py +41 -0
- semantic_code_intelligence/tests/test_scalability.py +138 -0
- semantic_code_intelligence/tests/test_scanner.py +125 -0
- semantic_code_intelligence/tests/test_search.py +160 -0
- semantic_code_intelligence/tests/test_semantic_chunker.py +255 -0
- semantic_code_intelligence/tests/test_tools.py +182 -0
- semantic_code_intelligence/tests/test_vector_store.py +151 -0
- semantic_code_intelligence/tests/test_watcher.py +211 -0
- semantic_code_intelligence/tools/__init__.py +442 -0
- semantic_code_intelligence/tools/executor.py +232 -0
- semantic_code_intelligence/tools/protocol.py +200 -0
- semantic_code_intelligence/tui/__init__.py +454 -0
- semantic_code_intelligence/utils/__init__.py +0 -0
- semantic_code_intelligence/utils/logging.py +112 -0
- semantic_code_intelligence/version.py +3 -0
- semantic_code_intelligence/web/__init__.py +11 -0
- semantic_code_intelligence/web/api.py +289 -0
- semantic_code_intelligence/web/server.py +397 -0
- semantic_code_intelligence/web/ui.py +659 -0
- semantic_code_intelligence/web/visualize.py +226 -0
- semantic_code_intelligence/workspace/__init__.py +427 -0
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
"""Raw filesystem grep — search files directly without requiring an index.
|
|
2
|
+
|
|
3
|
+
Provides ripgrep-compatible grep that works on raw files, not just indexed
|
|
4
|
+
chunks. Uses ``ripgrep`` if available on PATH for maximum speed, falling
|
|
5
|
+
back to a pure-Python implementation.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
import re
|
|
12
|
+
import shutil
|
|
13
|
+
import subprocess
|
|
14
|
+
import json
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
from semantic_code_intelligence.config.settings import AppConfig, load_config
|
|
20
|
+
from semantic_code_intelligence.utils.logging import get_logger
|
|
21
|
+
|
|
22
|
+
logger = get_logger("search.grep")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class GrepMatch:
    """One line reported by a grep run.

    A record is either a real hit or a surrounding context line; the two
    are told apart by ``is_context``.
    """

    # Path of the file the line belongs to, as reported by the backend.
    file_path: str
    # Line number of the reported line within the file.
    line_number: int
    # Text of the line.
    line_content: str
    # Offset of the match inside the line; stays 0 when no offset applies.
    column: int = 0
    # True for -A/-B context lines, False for actual matches.
    is_context: bool = False

    def to_dict(self) -> dict[str, Any]:
        """Serialize the record to a plain dict.

        The ``is_context`` key is only present (and then always ``True``)
        for context lines, so ordinary matches keep a compact payload.
        """
        payload: dict[str, Any] = dict(
            file_path=self.file_path,
            line_number=self.line_number,
            line_content=self.line_content,
            column=self.column,
        )
        if not self.is_context:
            return payload
        payload["is_context"] = True
        return payload
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
class GrepResult:
    """Aggregate outcome of one grep invocation."""

    # The pattern that was searched for, exactly as supplied by the caller.
    pattern: str
    # Reported lines (matches and, when requested, context lines).
    matches: list[GrepMatch]
    # Number of files the backend looked at.
    files_searched: int
    # Number of distinct files that produced at least one match.
    files_matched: int
    # Which implementation produced the result: "ripgrep" or "python".
    backend: str

    def to_dict(self) -> dict[str, Any]:
        """Serialize the result, including every match, to a plain dict.

        ``match_count`` is derived from the number of entries in
        ``matches`` rather than stored separately.
        """
        summary: dict[str, Any] = {}
        summary["pattern"] = self.pattern
        summary["match_count"] = len(self.matches)
        summary["files_searched"] = self.files_searched
        summary["files_matched"] = self.files_matched
        summary["backend"] = self.backend
        summary["matches"] = [entry.to_dict() for entry in self.matches]
        return summary
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _has_ripgrep() -> str | None:
|
|
69
|
+
"""Return path to ripgrep binary if available."""
|
|
70
|
+
return shutil.which("rg")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _ripgrep_search(
    pattern: str,
    root: Path,
    *,
    case_insensitive: bool = True,
    max_results: int = 100,
    file_glob: str | None = None,
    context_before: int = 0,
    context_after: int = 0,
    word_match: bool = False,
    invert_match: bool = False,
    include_hidden: bool = False,
    count_only: bool = False,
) -> GrepResult:
    """Run ripgrep and parse its JSON Lines output.

    Args:
        pattern: Regex pattern to search for.
        root: Directory to search; relative paths are made absolute first.
        case_insensitive: Pass ``-i`` to ripgrep.
        max_results: Upper bound on the number of returned entries
            (matches and context lines together).
        file_glob: Optional ``-g`` glob restricting which files are searched.
        context_before: Lines of context before each match (``-B``).
        context_after: Lines of context after each match (``-A``).
        word_match: Match whole words only (``-w``).
        invert_match: Report lines that do NOT match (``--invert-match``).
        include_hidden: Search hidden files and directories (``--hidden``).
        count_only: Accepted for interface compatibility. It is not
            forwarded to ripgrep because ``--count`` cannot be combined
            with ``--json``; callers can aggregate the returned matches.

    Returns:
        A ``GrepResult`` with ``backend="ripgrep"``. ``files_searched`` is
        always 0 because it is not extracted from ripgrep's output here.
        An empty result is returned when ripgrep times out.

    Raises:
        RuntimeError: If ripgrep is not installed, or if it exits with an
            error status without producing any output (for example when
            the pattern is not valid for ripgrep's regex engine).
    """
    rg = _has_ripgrep()
    if not rg:
        raise RuntimeError("ripgrep not found")

    # Use one absolute path for both cwd and the search path; a relative
    # root would otherwise be resolved a second time relative to cwd.
    search_root = os.path.abspath(root)

    # --max-count limits matches per file; the overall cap is applied by
    # slicing the collected list at the end.
    cmd = [rg, "--json", "--max-count", str(max_results)]
    if case_insensitive:
        cmd.append("-i")
    if file_glob:
        cmd.extend(["-g", file_glob])
    if context_before > 0:
        cmd.extend(["-B", str(context_before)])
    if context_after > 0:
        cmd.extend(["-A", str(context_after)])
    if word_match:
        cmd.append("-w")
    if invert_match:
        cmd.append("--invert-match")
    if include_hidden:
        cmd.append("--hidden")
    # NOTE: count_only is intentionally not translated to --count; ripgrep
    # reports an error for --json together with --count.

    # "--" ends option parsing so patterns beginning with "-" are treated
    # as patterns rather than flags.
    cmd.extend(["--", pattern, search_root])

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            # ripgrep emits UTF-8 JSON regardless of the local code page.
            encoding="utf-8",
            errors="replace",
            timeout=30,
            cwd=search_root,
        )
    except subprocess.TimeoutExpired:
        logger.warning("ripgrep timed out after 30s")
        return GrepResult(pattern=pattern, matches=[], files_searched=0,
                          files_matched=0, backend="ripgrep")

    matches: list[GrepMatch] = []
    files_matched: set[str] = set()
    saw_record = False

    # Split on "\n" only: str.splitlines() would also break on characters
    # such as U+2028 that may legally appear inside a JSON string.
    for line in result.stdout.split("\n"):
        if not line:
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            continue
        if not isinstance(data, dict):
            continue
        saw_record = True

        dtype = data.get("type")
        if dtype == "match":
            match_data = data["data"]
            path_text = match_data["path"]["text"]
            line_text = match_data["lines"]["text"].rstrip("\n")
            # Inverted matches carry no submatches; still report the line
            # once, with column 0.
            submatches = match_data.get("submatches") or [{}]
            for submatch in submatches:
                matches.append(GrepMatch(
                    file_path=path_text,
                    line_number=match_data["line_number"],
                    line_content=line_text,
                    column=submatch.get("start", 0),
                ))
            files_matched.add(path_text)
        elif dtype == "context":
            ctx_data = data["data"]
            matches.append(GrepMatch(
                file_path=ctx_data["path"]["text"],
                line_number=ctx_data["line_number"],
                line_content=ctx_data["lines"]["text"].rstrip("\n"),
                column=0,
                is_context=True,
            ))

    # Exit status 0 = matches, 1 = no matches. Anything else is an error;
    # if it came with no usable output, surface it so the caller can fall
    # back instead of mistaking the failure for "no matches".
    if result.returncode not in (0, 1) and not saw_record:
        raise RuntimeError(
            f"ripgrep failed (exit {result.returncode}): {result.stderr.strip()}"
        )

    return GrepResult(
        pattern=pattern,
        matches=matches[:max_results],
        files_searched=0,  # ripgrep doesn't report this easily
        files_matched=len(files_matched),
        backend="ripgrep",
    )
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _python_grep(
    pattern: str,
    root: Path,
    *,
    case_insensitive: bool = True,
    max_results: int = 100,
    extensions: set[str] | None = None,
    context_before: int = 0,
    context_after: int = 0,
    word_match: bool = False,
    invert_match: bool = False,
    include_hidden: bool = False,
) -> GrepResult:
    """Pure-Python grep fallback over raw files.

    Args:
        pattern: Regex pattern (Python ``re`` syntax) to search for.
        root: Directory to walk.
        case_insensitive: Compile the pattern with ``re.IGNORECASE``.
        max_results: Upper bound on returned entries (matches and context
            lines together). The walk stops after the file that reaches it.
        extensions: File suffixes to search. When ``None`` they are taken
            from the project configuration, with a built-in default set
            if the configuration cannot be loaded.
        context_before: Lines of context before each match.
        context_after: Lines of context after each match.
        word_match: Require word boundaries around the whole pattern.
        invert_match: Report lines that do NOT match.
        include_hidden: Also search dot-files and dot-directories.

    Returns:
        A ``GrepResult`` with ``backend="python"``. File paths are
        relative to ``root`` and line numbers are 1-based. An invalid
        pattern is logged and yields an empty result.
    """
    # Group the pattern so alternations ("a|b") stay inside the boundaries.
    actual_pattern = rf"\b(?:{pattern})\b" if word_match else pattern
    flags = re.IGNORECASE if case_insensitive else 0
    try:
        compiled = re.compile(actual_pattern, flags)
    except re.error as exc:
        logger.warning("Invalid regex pattern %r: %s", pattern, exc)
        return GrepResult(pattern=pattern, matches=[], files_searched=0,
                          files_matched=0, backend="python")

    if extensions is None:
        try:
            config = load_config(root)
            extensions = set(config.index.extensions)
        except Exception:
            # Best effort: grep must work without a usable project config.
            extensions = {".py", ".js", ".ts", ".java", ".go", ".rs", ".rb", ".cpp", ".cs"}

    skipped_dirs = {"node_modules", "__pycache__"}
    # Negative context sizes behave like zero.
    before = max(0, context_before)
    after = max(0, context_after)

    matches: list[GrepMatch] = []
    files_searched = 0
    files_matched: set[str] = set()

    for dirpath, dirnames, filenames in os.walk(root):
        # Prune in place so os.walk never descends into skipped
        # directories, at any depth.
        dirnames[:] = [
            d for d in dirnames
            if d not in skipped_dirs and (include_hidden or not d.startswith("."))
        ]

        for fname in filenames:
            if not include_hidden and fname.startswith("."):
                continue
            if Path(fname).suffix not in extensions:
                continue

            fpath = Path(dirpath) / fname
            files_searched += 1

            try:
                content = fpath.read_text(encoding="utf-8", errors="replace")
            except OSError:
                continue

            lines = content.splitlines()

            # Map each reported line index to its match column
            # (0 for inverted matches, which have no match position).
            hit_columns: dict[int, int] = {}
            for idx, line in enumerate(lines):
                found = compiled.search(line)
                if invert_match:
                    if found is None:
                        hit_columns[idx] = 0
                elif found is not None:
                    hit_columns[idx] = found.start()

            if not hit_columns:
                continue

            rel_path = str(fpath.relative_to(root))
            files_matched.add(rel_path)

            # Union of every match's context window. Emitting from the
            # union keeps output in line order and guarantees a line that
            # is both a match and another match's context is a match.
            if before or after:
                wanted: set[int] = set()
                for idx in hit_columns:
                    wanted.update(range(max(0, idx - before),
                                        min(len(lines), idx + 1 + after)))
            else:
                wanted = set(hit_columns)

            for idx in sorted(wanted):
                column = hit_columns.get(idx)
                matches.append(GrepMatch(
                    file_path=rel_path,
                    line_number=idx + 1,
                    line_content=lines[idx],
                    column=0 if column is None else column,
                    is_context=column is None,
                ))

            if len(matches) >= max_results:
                return GrepResult(
                    pattern=pattern,
                    matches=matches[:max_results],
                    files_searched=files_searched,
                    files_matched=len(files_matched),
                    backend="python",
                )

    return GrepResult(
        pattern=pattern,
        matches=matches,
        files_searched=files_searched,
        files_matched=len(files_matched),
        backend="python",
    )
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def _extensions_from_glob(file_glob: str | None) -> set[str] | None:
    """Derive a one-element suffix set from a simple glob such as ``*.py``.

    Returns ``None`` when no literal extension can be read off the glob
    (no dot, a negated glob, a dot-file name, or wildcards in the suffix).
    """
    if not file_glob or file_glob.startswith("!"):
        return None
    # Only the final path component can carry the file extension.
    name = file_glob.replace("\\", "/").rsplit("/", 1)[-1]
    stem, dot, ext = name.rpartition(".")
    if not (stem and dot and ext):
        return None
    if any(ch in ext for ch in "*?[]{}"):
        return None
    return {f".{ext}"}


def grep_search(
    pattern: str,
    root: Path,
    *,
    case_insensitive: bool = True,
    max_results: int = 100,
    use_ripgrep: bool = True,
    file_glob: str | None = None,
    context_before: int = 0,
    context_after: int = 0,
    word_match: bool = False,
    invert_match: bool = False,
    include_hidden: bool = False,
    count_only: bool = False,
) -> GrepResult:
    """Search raw files using ripgrep (if available) or Python fallback.

    Unlike indexed search modes, this searches the actual filesystem
    without requiring an index. Instant results, zero setup.

    Args:
        pattern: Regex pattern to search for.
        root: Project root to search.
        case_insensitive: Case-insensitive matching.
        max_results: Maximum matches to return.
        use_ripgrep: Try ripgrep first (recommended).
        file_glob: Optional glob to filter files (e.g., "*.py"). The
            Python fallback only honours the glob's file extension.
        context_before: Lines of context before each match (-B).
        context_after: Lines of context after each match (-A).
        word_match: Match whole words only (-w).
        invert_match: Show non-matching lines (-v).
        include_hidden: Include hidden files/directories.
        count_only: Only return match counts per file (-c). Forwarded to
            the ripgrep backend only; the Python fallback ignores it.

    Returns:
        The ``GrepResult`` of whichever backend ran; its ``backend``
        field says which one.
    """
    if use_ripgrep and _has_ripgrep():
        try:
            return _ripgrep_search(
                pattern, root,
                case_insensitive=case_insensitive,
                max_results=max_results,
                file_glob=file_glob,
                context_before=context_before,
                context_after=context_after,
                word_match=word_match,
                invert_match=invert_match,
                include_hidden=include_hidden,
                count_only=count_only,
            )
        except Exception:
            # Any ripgrep failure degrades to the Python implementation;
            # keep the traceback so the cause is still diagnosable.
            logger.debug("ripgrep failed, falling back to Python grep", exc_info=True)

    # Path(".py").suffix is "" (".py" is a dot-file name), so the
    # extension must be parsed from the glob text directly.
    extensions = _extensions_from_glob(file_glob)

    return _python_grep(
        pattern, root,
        case_insensitive=case_insensitive,
        max_results=max_results,
        extensions=extensions,
        context_before=context_before,
        context_after=context_after,
        word_match=word_match,
        invert_match=invert_match,
        include_hidden=include_hidden,
    )
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""Hybrid search — fuses semantic (FAISS) and keyword (BM25) results via RRF.
|
|
2
|
+
|
|
3
|
+
Reciprocal Rank Fusion combines two ranked lists into a single list that
|
|
4
|
+
benefits from both semantic understanding and exact keyword matching.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from semantic_code_intelligence.config.settings import AppConfig, load_config
|
|
14
|
+
from semantic_code_intelligence.embeddings.generator import generate_embeddings
|
|
15
|
+
from semantic_code_intelligence.search.keyword_search import (
|
|
16
|
+
BM25Index,
|
|
17
|
+
KeywordResult,
|
|
18
|
+
_get_bm25,
|
|
19
|
+
keyword_search,
|
|
20
|
+
)
|
|
21
|
+
from semantic_code_intelligence.storage.vector_store import VectorStore
|
|
22
|
+
from semantic_code_intelligence.utils.logging import get_logger
|
|
23
|
+
|
|
24
|
+
logger = get_logger("search.hybrid")
|
|
25
|
+
|
|
26
|
+
# Default RRF constant (k=60 is standard in literature)
|
|
27
|
+
RRF_K = 60
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class HybridResult:
    """One chunk returned by hybrid search, with fused and per-arm scores."""

    # Location and content of the chunk.
    file_path: str
    start_line: int
    end_line: int
    language: str
    content: str
    # Fused RRF score used for ranking.
    score: float
    # Original cosine similarity (0 if the chunk was not in the semantic list).
    semantic_score: float
    # Original BM25 score (0 if the chunk was not in the keyword list).
    keyword_score: float
    chunk_index: int

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict with rounded scores.

        The fused score keeps 6 decimal places; the two per-arm scores
        keep 4.
        """
        fused_rounded = round(self.score, 6)
        semantic_rounded = round(self.semantic_score, 4)
        keyword_rounded = round(self.keyword_score, 4)
        return dict(
            file_path=self.file_path,
            start_line=self.start_line,
            end_line=self.end_line,
            language=self.language,
            content=self.content,
            score=fused_rounded,
            semantic_score=semantic_rounded,
            keyword_score=keyword_rounded,
            chunk_index=self.chunk_index,
        )
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _chunk_key(meta: Any) -> str:
|
|
59
|
+
"""Unique key for de-duplicating chunks across result lists."""
|
|
60
|
+
return f"{meta.file_path}:{meta.start_line}:{meta.end_line}"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def reciprocal_rank_fusion(
    semantic_ranking: list[tuple[int, float]],
    keyword_ranking: list[tuple[int, float]],
    k: int = RRF_K,
) -> list[tuple[int, float, float, float]]:
    """Merge a semantic and a keyword ranking with Reciprocal Rank Fusion.

    Every entry contributes ``1 / (k + rank + 1)`` (``rank`` is the
    0-based position in its list) to the fused score of its chunk, so a
    chunk present in both lists receives the sum of both contributions.

    Args:
        semantic_ranking: [(chunk_index_in_store, cosine_score), ...] ordered best-first.
        keyword_ranking: [(chunk_index_in_store, bm25_score), ...] ordered best-first.
        k: RRF smoothing constant.

    Returns:
        [(chunk_index, fused_score, semantic_score, keyword_score), ...]
        sorted by fused_score descending. A per-arm score is 0.0 when the
        chunk did not appear in that arm's list.
    """
    fused_totals: dict[int, float] = {}
    semantic_by_chunk: dict[int, float] = {}
    keyword_by_chunk: dict[int, float] = {}

    # Both arms are accumulated identically; only the dict remembering the
    # arm's original score differs.
    arms = (
        (semantic_ranking, semantic_by_chunk),
        (keyword_ranking, keyword_by_chunk),
    )
    for ranking, originals in arms:
        for rank, (chunk_idx, raw_score) in enumerate(ranking):
            contribution = 1.0 / (k + rank + 1)
            fused_totals[chunk_idx] = fused_totals.get(chunk_idx, 0.0) + contribution
            originals[chunk_idx] = raw_score

    rows = []
    for chunk_idx, total in fused_totals.items():
        rows.append((
            chunk_idx,
            total,
            semantic_by_chunk.get(chunk_idx, 0.0),
            keyword_by_chunk.get(chunk_idx, 0.0),
        ))
    return sorted(rows, key=lambda row: row[1], reverse=True)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def hybrid_search(
    query: str,
    store: VectorStore,
    index_dir: Path,
    model_name: str = "all-MiniLM-L6-v2",
    top_k: int = 10,
    semantic_weight: int | None = None,
    keyword_weight: int | None = None,
) -> list[HybridResult]:
    """Run semantic and BM25 keyword search and fuse the two rankings.

    Args:
        query: Natural language or keyword query.
        store: Loaded VectorStore.
        index_dir: Path to index directory.
        model_name: Embedding model name.
        top_k: Number of final results.
        semantic_weight: Candidate count requested from the semantic arm;
            ``None`` (or 0) falls back to 2×top_k.
        keyword_weight: Candidate count requested from the keyword arm;
            ``None`` (or 0) falls back to 2×top_k.

    Returns:
        List of HybridResult, sorted by fused RRF score. Empty when the
        store holds no entries.
    """
    if store.size == 0:
        return []

    default_pool = top_k * 2

    # --- Semantic arm ---
    query_vector = generate_embeddings([query], model_name=model_name)[0]
    semantic_hits = store.search(query_vector, top_k=semantic_weight or default_pool)

    # The semantic arm yields (metadata, score) pairs, while fusion works
    # on positions in store.metadata. Build key -> first position so each
    # hit can be translated; the first occurrence wins for duplicate keys.
    position_by_key: dict[str, int] = {}
    for position, chunk_meta in enumerate(store.metadata):
        position_by_key.setdefault(_chunk_key(chunk_meta), position)

    semantic_ranking: list[tuple[int, float]] = []
    for hit_meta, similarity in semantic_hits:
        position = position_by_key.get(_chunk_key(hit_meta), -1)
        if position < 0:
            # Hit whose key is not present in the store metadata: skip it.
            continue
        semantic_ranking.append((position, float(similarity)))

    # --- Keyword arm (BM25) ---
    bm25 = _get_bm25(index_dir, store)
    keyword_ranking = bm25.search(query, top_k=keyword_weight or default_pool)

    # --- Fusion ---
    fused = reciprocal_rank_fusion(semantic_ranking, keyword_ranking)

    results: list[HybridResult] = []
    for store_idx, rrf_score, cosine, bm25_score in fused[:top_k]:
        chunk = store.metadata[store_idx]
        hit = HybridResult(
            file_path=chunk.file_path,
            start_line=chunk.start_line,
            end_line=chunk.end_line,
            language=chunk.language,
            content=chunk.content,
            score=rrf_score,
            semantic_score=cosine,
            keyword_score=bm25_score,
            chunk_index=chunk.chunk_index,
        )
        results.append(hit)

    return results
|