codexa 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codexa-0.4.0.dist-info/METADATA +650 -0
- codexa-0.4.0.dist-info/RECORD +189 -0
- codexa-0.4.0.dist-info/WHEEL +5 -0
- codexa-0.4.0.dist-info/entry_points.txt +2 -0
- codexa-0.4.0.dist-info/licenses/LICENSE +21 -0
- codexa-0.4.0.dist-info/top_level.txt +1 -0
- semantic_code_intelligence/__init__.py +5 -0
- semantic_code_intelligence/analysis/__init__.py +21 -0
- semantic_code_intelligence/analysis/ai_features.py +351 -0
- semantic_code_intelligence/bridge/__init__.py +28 -0
- semantic_code_intelligence/bridge/context_provider.py +245 -0
- semantic_code_intelligence/bridge/protocol.py +167 -0
- semantic_code_intelligence/bridge/server.py +348 -0
- semantic_code_intelligence/bridge/vscode.py +271 -0
- semantic_code_intelligence/ci/__init__.py +13 -0
- semantic_code_intelligence/ci/hooks.py +98 -0
- semantic_code_intelligence/ci/hotspots.py +272 -0
- semantic_code_intelligence/ci/impact.py +246 -0
- semantic_code_intelligence/ci/metrics.py +591 -0
- semantic_code_intelligence/ci/pr.py +412 -0
- semantic_code_intelligence/ci/quality.py +557 -0
- semantic_code_intelligence/ci/templates.py +164 -0
- semantic_code_intelligence/ci/trace.py +224 -0
- semantic_code_intelligence/cli/__init__.py +0 -0
- semantic_code_intelligence/cli/commands/__init__.py +0 -0
- semantic_code_intelligence/cli/commands/ask_cmd.py +153 -0
- semantic_code_intelligence/cli/commands/benchmark_cmd.py +303 -0
- semantic_code_intelligence/cli/commands/chat_cmd.py +252 -0
- semantic_code_intelligence/cli/commands/ci_gen_cmd.py +74 -0
- semantic_code_intelligence/cli/commands/context_cmd.py +120 -0
- semantic_code_intelligence/cli/commands/cross_refactor_cmd.py +113 -0
- semantic_code_intelligence/cli/commands/deps_cmd.py +91 -0
- semantic_code_intelligence/cli/commands/docs_cmd.py +101 -0
- semantic_code_intelligence/cli/commands/doctor_cmd.py +147 -0
- semantic_code_intelligence/cli/commands/evolve_cmd.py +171 -0
- semantic_code_intelligence/cli/commands/explain_cmd.py +112 -0
- semantic_code_intelligence/cli/commands/gate_cmd.py +135 -0
- semantic_code_intelligence/cli/commands/grep_cmd.py +234 -0
- semantic_code_intelligence/cli/commands/hotspots_cmd.py +119 -0
- semantic_code_intelligence/cli/commands/impact_cmd.py +131 -0
- semantic_code_intelligence/cli/commands/index_cmd.py +138 -0
- semantic_code_intelligence/cli/commands/init_cmd.py +152 -0
- semantic_code_intelligence/cli/commands/investigate_cmd.py +163 -0
- semantic_code_intelligence/cli/commands/languages_cmd.py +101 -0
- semantic_code_intelligence/cli/commands/lsp_cmd.py +49 -0
- semantic_code_intelligence/cli/commands/mcp_cmd.py +50 -0
- semantic_code_intelligence/cli/commands/metrics_cmd.py +264 -0
- semantic_code_intelligence/cli/commands/models_cmd.py +157 -0
- semantic_code_intelligence/cli/commands/plugin_cmd.py +275 -0
- semantic_code_intelligence/cli/commands/pr_summary_cmd.py +178 -0
- semantic_code_intelligence/cli/commands/quality_cmd.py +208 -0
- semantic_code_intelligence/cli/commands/refactor_cmd.py +103 -0
- semantic_code_intelligence/cli/commands/review_cmd.py +88 -0
- semantic_code_intelligence/cli/commands/search_cmd.py +236 -0
- semantic_code_intelligence/cli/commands/serve_cmd.py +117 -0
- semantic_code_intelligence/cli/commands/suggest_cmd.py +100 -0
- semantic_code_intelligence/cli/commands/summary_cmd.py +78 -0
- semantic_code_intelligence/cli/commands/tool_cmd.py +282 -0
- semantic_code_intelligence/cli/commands/trace_cmd.py +123 -0
- semantic_code_intelligence/cli/commands/tui_cmd.py +58 -0
- semantic_code_intelligence/cli/commands/viz_cmd.py +127 -0
- semantic_code_intelligence/cli/commands/watch_cmd.py +72 -0
- semantic_code_intelligence/cli/commands/web_cmd.py +61 -0
- semantic_code_intelligence/cli/commands/workspace_cmd.py +250 -0
- semantic_code_intelligence/cli/main.py +65 -0
- semantic_code_intelligence/cli/router.py +92 -0
- semantic_code_intelligence/config/__init__.py +0 -0
- semantic_code_intelligence/config/settings.py +260 -0
- semantic_code_intelligence/context/__init__.py +19 -0
- semantic_code_intelligence/context/engine.py +429 -0
- semantic_code_intelligence/context/memory.py +253 -0
- semantic_code_intelligence/daemon/__init__.py +1 -0
- semantic_code_intelligence/daemon/watcher.py +515 -0
- semantic_code_intelligence/docs/__init__.py +1080 -0
- semantic_code_intelligence/embeddings/__init__.py +0 -0
- semantic_code_intelligence/embeddings/enhanced.py +131 -0
- semantic_code_intelligence/embeddings/generator.py +149 -0
- semantic_code_intelligence/embeddings/model_registry.py +100 -0
- semantic_code_intelligence/evolution/__init__.py +1 -0
- semantic_code_intelligence/evolution/budget_guard.py +111 -0
- semantic_code_intelligence/evolution/commit_manager.py +88 -0
- semantic_code_intelligence/evolution/context_builder.py +131 -0
- semantic_code_intelligence/evolution/engine.py +249 -0
- semantic_code_intelligence/evolution/patch_generator.py +229 -0
- semantic_code_intelligence/evolution/task_selector.py +214 -0
- semantic_code_intelligence/evolution/test_runner.py +111 -0
- semantic_code_intelligence/indexing/__init__.py +0 -0
- semantic_code_intelligence/indexing/chunker.py +174 -0
- semantic_code_intelligence/indexing/parallel.py +86 -0
- semantic_code_intelligence/indexing/scanner.py +146 -0
- semantic_code_intelligence/indexing/semantic_chunker.py +337 -0
- semantic_code_intelligence/llm/__init__.py +62 -0
- semantic_code_intelligence/llm/cache.py +219 -0
- semantic_code_intelligence/llm/cached_provider.py +145 -0
- semantic_code_intelligence/llm/conversation.py +190 -0
- semantic_code_intelligence/llm/cross_refactor.py +272 -0
- semantic_code_intelligence/llm/investigation.py +274 -0
- semantic_code_intelligence/llm/mock_provider.py +77 -0
- semantic_code_intelligence/llm/ollama_provider.py +122 -0
- semantic_code_intelligence/llm/openai_provider.py +100 -0
- semantic_code_intelligence/llm/provider.py +92 -0
- semantic_code_intelligence/llm/rate_limiter.py +164 -0
- semantic_code_intelligence/llm/reasoning.py +438 -0
- semantic_code_intelligence/llm/safety.py +110 -0
- semantic_code_intelligence/llm/streaming.py +251 -0
- semantic_code_intelligence/lsp/__init__.py +609 -0
- semantic_code_intelligence/mcp/__init__.py +393 -0
- semantic_code_intelligence/parsing/__init__.py +19 -0
- semantic_code_intelligence/parsing/parser.py +375 -0
- semantic_code_intelligence/plugins/__init__.py +255 -0
- semantic_code_intelligence/plugins/examples/__init__.py +1 -0
- semantic_code_intelligence/plugins/examples/code_quality.py +73 -0
- semantic_code_intelligence/plugins/examples/search_annotator.py +56 -0
- semantic_code_intelligence/scalability/__init__.py +205 -0
- semantic_code_intelligence/search/__init__.py +0 -0
- semantic_code_intelligence/search/formatter.py +123 -0
- semantic_code_intelligence/search/grep.py +361 -0
- semantic_code_intelligence/search/hybrid_search.py +170 -0
- semantic_code_intelligence/search/keyword_search.py +311 -0
- semantic_code_intelligence/search/section_expander.py +103 -0
- semantic_code_intelligence/services/__init__.py +0 -0
- semantic_code_intelligence/services/indexing_service.py +630 -0
- semantic_code_intelligence/services/search_service.py +269 -0
- semantic_code_intelligence/storage/__init__.py +0 -0
- semantic_code_intelligence/storage/chunk_hash_store.py +86 -0
- semantic_code_intelligence/storage/hash_store.py +66 -0
- semantic_code_intelligence/storage/index_manifest.py +85 -0
- semantic_code_intelligence/storage/index_stats.py +138 -0
- semantic_code_intelligence/storage/query_history.py +160 -0
- semantic_code_intelligence/storage/symbol_registry.py +209 -0
- semantic_code_intelligence/storage/vector_store.py +297 -0
- semantic_code_intelligence/tests/__init__.py +0 -0
- semantic_code_intelligence/tests/test_ai_features.py +351 -0
- semantic_code_intelligence/tests/test_chunker.py +119 -0
- semantic_code_intelligence/tests/test_cli.py +188 -0
- semantic_code_intelligence/tests/test_config.py +154 -0
- semantic_code_intelligence/tests/test_context.py +381 -0
- semantic_code_intelligence/tests/test_embeddings.py +73 -0
- semantic_code_intelligence/tests/test_endtoend.py +1142 -0
- semantic_code_intelligence/tests/test_enhanced_embeddings.py +92 -0
- semantic_code_intelligence/tests/test_hash_store.py +79 -0
- semantic_code_intelligence/tests/test_logging.py +55 -0
- semantic_code_intelligence/tests/test_new_cli.py +138 -0
- semantic_code_intelligence/tests/test_parser.py +495 -0
- semantic_code_intelligence/tests/test_phase10.py +355 -0
- semantic_code_intelligence/tests/test_phase11.py +593 -0
- semantic_code_intelligence/tests/test_phase12.py +375 -0
- semantic_code_intelligence/tests/test_phase13.py +663 -0
- semantic_code_intelligence/tests/test_phase14.py +568 -0
- semantic_code_intelligence/tests/test_phase15.py +814 -0
- semantic_code_intelligence/tests/test_phase16.py +792 -0
- semantic_code_intelligence/tests/test_phase17.py +815 -0
- semantic_code_intelligence/tests/test_phase18.py +934 -0
- semantic_code_intelligence/tests/test_phase19.py +986 -0
- semantic_code_intelligence/tests/test_phase20.py +2753 -0
- semantic_code_intelligence/tests/test_phase20b.py +2058 -0
- semantic_code_intelligence/tests/test_phase20c.py +962 -0
- semantic_code_intelligence/tests/test_phase21.py +428 -0
- semantic_code_intelligence/tests/test_phase22.py +799 -0
- semantic_code_intelligence/tests/test_phase23.py +783 -0
- semantic_code_intelligence/tests/test_phase24.py +715 -0
- semantic_code_intelligence/tests/test_phase25.py +496 -0
- semantic_code_intelligence/tests/test_phase26.py +251 -0
- semantic_code_intelligence/tests/test_phase27.py +531 -0
- semantic_code_intelligence/tests/test_phase8.py +592 -0
- semantic_code_intelligence/tests/test_phase9.py +643 -0
- semantic_code_intelligence/tests/test_plugins.py +293 -0
- semantic_code_intelligence/tests/test_priority_features.py +727 -0
- semantic_code_intelligence/tests/test_router.py +41 -0
- semantic_code_intelligence/tests/test_scalability.py +138 -0
- semantic_code_intelligence/tests/test_scanner.py +125 -0
- semantic_code_intelligence/tests/test_search.py +160 -0
- semantic_code_intelligence/tests/test_semantic_chunker.py +255 -0
- semantic_code_intelligence/tests/test_tools.py +182 -0
- semantic_code_intelligence/tests/test_vector_store.py +151 -0
- semantic_code_intelligence/tests/test_watcher.py +211 -0
- semantic_code_intelligence/tools/__init__.py +442 -0
- semantic_code_intelligence/tools/executor.py +232 -0
- semantic_code_intelligence/tools/protocol.py +200 -0
- semantic_code_intelligence/tui/__init__.py +454 -0
- semantic_code_intelligence/utils/__init__.py +0 -0
- semantic_code_intelligence/utils/logging.py +112 -0
- semantic_code_intelligence/version.py +3 -0
- semantic_code_intelligence/web/__init__.py +11 -0
- semantic_code_intelligence/web/api.py +289 -0
- semantic_code_intelligence/web/server.py +397 -0
- semantic_code_intelligence/web/ui.py +659 -0
- semantic_code_intelligence/web/visualize.py +226 -0
- semantic_code_intelligence/workspace/__init__.py +427 -0
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
"""Keyword and regex search engine — BM25 scoring + regex matching.
|
|
2
|
+
|
|
3
|
+
Provides grep-compatible text search and BM25-ranked keyword search
|
|
4
|
+
over indexed code chunks, without requiring external dependencies.
|
|
5
|
+
Supports persistent BM25 index serialization for fast startup.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import math
|
|
12
|
+
import re
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
from semantic_code_intelligence.storage.vector_store import ChunkMetadata, VectorStore
|
|
18
|
+
from semantic_code_intelligence.utils.logging import get_logger
|
|
19
|
+
|
|
20
|
+
# Module-level logger for this file, obtained from the project's logging helper
# under the name "search.keyword".
logger = get_logger("search.keyword")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class KeywordResult:
    """One hit produced by ``keyword_search`` or ``regex_search``.

    The positional field order is part of the constructor interface.
    """

    # Location and payload of the chunk that matched.
    file_path: str
    start_line: int
    end_line: int
    language: str
    content: str
    # BM25 score for keyword search; number of matching lines for regex search.
    score: float
    chunk_index: int
    # Regex search fills these with the matching-line count and the matching
    # line numbers (chunk start line + offset); keyword search leaves them
    # at 0 and [].
    match_count: int
    matched_lines: list[int]

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a plain dict with ``score`` rounded to 4 decimals."""
        field_order = (
            "file_path",
            "start_line",
            "end_line",
            "language",
            "content",
            "score",
            "chunk_index",
            "match_count",
            "matched_lines",
        )
        payload: dict[str, Any] = {name: getattr(self, name) for name in field_order}
        # Only the score is transformed; every other field is passed through as-is.
        payload["score"] = round(self.score, 4)
        return payload
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# ---------------------------------------------------------------------------
|
|
52
|
+
# BM25 Scorer
|
|
53
|
+
# ---------------------------------------------------------------------------
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _tokenize(text: str) -> list[str]:
    """Extract word-like tokens from *text*, keeping their original casing.

    A token is one of: a run of lowercase ASCII letters, a single uppercase
    ASCII letter followed by any lowercase letters, or a run of digits.
    Every other character (underscores, whitespace, punctuation, non-ASCII
    text) only separates tokens.
    """
    # e.g. "getValue_2x" -> ["get", "Value", "2", "x"]; note that an all-caps
    # run such as "HTTP" yields one token per letter.
    return [m.group(0) for m in re.finditer(r"[a-z]+|[A-Z][a-z]*|[0-9]+", text)]
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _lower_tokens(text: str) -> list[str]:
    """Tokenize *text* with ``_tokenize`` and lowercase every resulting token."""
    return list(map(str.lower, _tokenize(text)))
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class BM25Index:
    """A lightweight in-memory BM25 index over chunk metadata.

    Built from a list of chunk metadata objects (only their ``content``
    attribute is read) so the same stored chunks can serve both semantic and
    keyword search. Document ids are positions in that list.

    Attributes:
        k1: BM25 term-frequency saturation parameter.
        b: BM25 document-length normalisation parameter.
        metadata: The metadata list the index was built for.
        n: Number of documents.
        doc_tokens: Lowercased tokens per document (empty when loaded from disk).
        doc_lengths: Token count per document.
        avgdl: Average token count per document.
        inverted: ``term -> {doc_idx: term_freq}`` postings.
    """

    k1: float = 1.5
    b: float = 0.75

    def __init__(self, metadata: list[ChunkMetadata]) -> None:
        """Tokenize every chunk in *metadata* and build the inverted index."""
        self.metadata = metadata
        self.n = len(metadata)
        self.doc_tokens: list[list[str]] = []
        self.doc_lengths: list[int] = []
        self.avgdl: float = 0.0
        # term -> {doc_idx: term_freq}
        self.inverted: dict[str, dict[int, int]] = {}
        self._build()

    def _build(self) -> None:
        """Populate token lists, document lengths, postings and ``avgdl``."""
        total_len = 0
        for idx, meta in enumerate(self.metadata):
            tokens = _lower_tokens(meta.content)
            self.doc_tokens.append(tokens)
            self.doc_lengths.append(len(tokens))
            total_len += len(tokens)

            # Per-document term frequencies, then merged into the postings.
            term_freqs: dict[str, int] = {}
            for tok in tokens:
                term_freqs[tok] = term_freqs.get(tok, 0) + 1
            for tok, freq in term_freqs.items():
                self.inverted.setdefault(tok, {})[idx] = freq
        self.avgdl = total_len / self.n if self.n else 1.0

    def search(self, query: str, top_k: int = 10) -> list[tuple[int, float]]:
        """Return up to *top_k* ``(doc_index, bm25_score)`` pairs, best first.

        Returns an empty list when *top_k* is not positive or when the query
        produces no tokens. Duplicate query tokens are counted once.
        """
        # A negative top_k would otherwise slice from the wrong end.
        if top_k <= 0:
            return []
        query_tokens = _lower_tokens(query)
        if not query_tokens:
            return []

        # avgdl can only be 0 if every document is empty (or the persisted
        # value was bad); fall back to 1.0 so the division below is safe.
        avgdl = self.avgdl or 1.0

        scores: dict[int, float] = {}
        for token in set(query_tokens):
            postings = self.inverted.get(token)
            if not postings:
                continue
            df = len(postings)
            # The "+ 1.0" inside the log keeps idf positive even for terms
            # that occur in most documents.
            idf = math.log((self.n - df + 0.5) / (df + 0.5) + 1.0)
            for doc_idx, tf in postings.items():
                dl = self.doc_lengths[doc_idx]
                numerator = tf * (self.k1 + 1)
                denominator = tf + self.k1 * (1 - self.b + self.b * dl / avgdl)
                scores[doc_idx] = scores.get(doc_idx, 0.0) + idf * numerator / denominator

        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return ranked[:top_k]

    def save(self, directory: Path) -> None:
        """Persist the index to ``<directory>/bm25_index.json`` for fast reload.

        Saves the inverted index, document lengths and stats as JSON; the
        per-document token lists are not written. The directory is created
        if it does not exist.

        Raises:
            OSError: If the directory or file cannot be written.
        """
        directory = Path(directory)
        directory.mkdir(parents=True, exist_ok=True)
        bm25_path = directory / "bm25_index.json"
        data = {
            "n": self.n,
            "avgdl": self.avgdl,
            "doc_lengths": self.doc_lengths,
            # JSON object keys must be strings, so doc ids are stringified
            # here and converted back to int in load().
            "inverted": {
                term: {str(k): v for k, v in postings.items()}
                for term, postings in self.inverted.items()
            },
        }
        bm25_path.write_text(
            json.dumps(data, ensure_ascii=False),
            encoding="utf-8",
        )
        logger.debug("Saved BM25 index (%d docs, %d terms) to %s",
                     self.n, len(self.inverted), directory)

    @classmethod
    def load(cls, directory: Path, metadata: list[ChunkMetadata]) -> "BM25Index | None":
        """Load a persisted BM25 index if available and valid.

        Returns None (so the caller rebuilds) when the file does not exist,
        cannot be read or parsed, is structurally inconsistent, or records a
        document count different from ``len(metadata)``.

        Note that freshness is judged by document count only; a cache written
        for different chunks with the same count is accepted.
        """
        bm25_path = Path(directory) / "bm25_index.json"
        if not bm25_path.exists():
            return None
        try:
            data = json.loads(bm25_path.read_text(encoding="utf-8"))
            n = data["n"]
            if n != len(metadata):
                logger.debug("BM25 cache stale (%d vs %d docs), rebuilding.",
                             n, len(metadata))
                return None

            doc_lengths = data["doc_lengths"]
            if len(doc_lengths) != n:
                raise ValueError("doc_lengths does not cover every document")

            inverted = {
                term: {int(k): v for k, v in postings.items()}
                for term, postings in data["inverted"].items()
            }
            # search() indexes doc_lengths by posting id, so reject ids that
            # fall outside the document range instead of failing later.
            for postings in inverted.values():
                if postings and (min(postings) < 0 or max(postings) >= n):
                    raise ValueError("posting refers to an unknown document")

            # Bypass __init__ so the chunks are not re-tokenized.
            idx = cls.__new__(cls)
            idx.metadata = metadata
            idx.n = n
            idx.avgdl = data["avgdl"]
            idx.doc_lengths = doc_lengths
            idx.doc_tokens = []  # not needed for search
            idx.inverted = inverted
            logger.debug("Loaded BM25 index from disk (%d docs).", idx.n)
            return idx
        except (OSError, ValueError, KeyError, TypeError, AttributeError):
            # ValueError covers json.JSONDecodeError, bad int() keys and the
            # consistency checks above; AttributeError covers non-dict postings.
            logger.debug("BM25 cache corrupt, rebuilding.")
            return None
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
# ---------------------------------------------------------------------------
|
|
184
|
+
# Public API
|
|
185
|
+
# ---------------------------------------------------------------------------
|
|
186
|
+
|
|
187
|
+
# In-process cache of BM25 indexes, keyed by ``str(index_dir)``; read and
# filled by ``_get_bm25``.
_bm25_cache: dict[str, BM25Index] = {}
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _get_bm25(index_dir: Path, store: VectorStore) -> BM25Index:
    """Get or build a BM25 index for the given vector store.

    Checks, in order: the in-memory cache, the on-disk cache, then builds a
    fresh index from ``store.metadata``. A freshly built index is written to
    *index_dir* on a best-effort basis: a failure to write is logged and does
    not prevent the index from being returned.

    Args:
        index_dir: Index directory; its string form is the cache key and it
            is where the persisted index lives.
        store: Vector store providing ``size`` and ``metadata``.

    Returns:
        A BM25 index whose document count matches the store.
    """
    cache_key = str(index_dir)
    cached = _bm25_cache.get(cache_key)
    # NOTE: freshness is judged by document count only, both here and in
    # BM25Index.load; chunks that changed without changing the count will
    # reuse the old index.
    if cached is not None and cached.n == store.size:
        return cached

    # Try loading from disk, otherwise build fresh and persist.
    idx = BM25Index.load(index_dir, store.metadata)
    if idx is None:
        logger.debug("Building BM25 index over %d chunks.", store.size)
        idx = BM25Index(store.metadata)
        try:
            idx.save(index_dir)
        except OSError as exc:
            # Persistence is only a startup optimization; a read-only or
            # missing index directory must not break the search itself.
            logger.warning("Could not persist BM25 index to %s: %s", index_dir, exc)

    _bm25_cache[cache_key] = idx
    return idx
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def keyword_search(
    query: str,
    store: VectorStore,
    index_dir: Path,
    top_k: int = 10,
    threshold: float = 0.0,
) -> list[KeywordResult]:
    """Run a BM25-ranked keyword search over the chunks held by *store*.

    Args:
        query: The search query (natural language or keywords).
        store: Loaded VectorStore with metadata.
        index_dir: Path to index directory (used for BM25 index caching).
        top_k: Max number of hits requested from the BM25 index.
        threshold: Minimum BM25 score; lower-scoring hits are dropped after
            the top-k cut.

    Returns:
        KeywordResult list in descending score order. ``match_count`` is 0
        and ``matched_lines`` is empty for every result.
    """
    if store.size == 0:
        return []

    ranked = _get_bm25(index_dir, store).search(query, top_k=top_k)

    found: list[KeywordResult] = []
    for position, bm25_score in ranked:
        if not bm25_score < threshold:
            chunk = store.metadata[position]
            found.append(
                KeywordResult(
                    file_path=chunk.file_path,
                    start_line=chunk.start_line,
                    end_line=chunk.end_line,
                    language=chunk.language,
                    content=chunk.content,
                    score=bm25_score,
                    chunk_index=chunk.chunk_index,
                    match_count=0,
                    matched_lines=[],
                )
            )
    return found
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def regex_search(
    pattern: str,
    store: VectorStore,
    top_k: int = 10,
    case_insensitive: bool = True,
) -> list[KeywordResult]:
    """Grep-style regex search, applied line by line to every chunk in *store*.

    Args:
        pattern: Regex pattern string.
        store: Loaded VectorStore with metadata.
        top_k: Max results.
        case_insensitive: Whether to compile the pattern with ``re.IGNORECASE``.

    Returns:
        Up to *top_k* KeywordResult objects, ordered by number of matching
        lines (highest first). An invalid pattern is logged as a warning and
        yields an empty list.
    """
    if store.size == 0:
        return []

    try:
        matcher = re.compile(pattern, re.IGNORECASE if case_insensitive else 0)
    except re.error as exc:
        logger.warning("Invalid regex pattern %r: %s", pattern, exc)
        return []

    found: list[KeywordResult] = []
    for chunk in store.metadata:
        # Line numbers are the chunk's start line plus the offset of the
        # matching line within the chunk content.
        hit_lines = [
            chunk.start_line + offset
            for offset, text in enumerate(chunk.content.splitlines())
            if matcher.search(text)
        ]
        if not hit_lines:
            continue
        found.append(
            KeywordResult(
                file_path=chunk.file_path,
                start_line=chunk.start_line,
                end_line=chunk.end_line,
                language=chunk.language,
                content=chunk.content,
                score=float(len(hit_lines)),
                chunk_index=chunk.chunk_index,
                match_count=len(hit_lines),
                matched_lines=hit_lines,
            )
        )

    # Stable sort: chunks with equal match counts keep their store order.
    return sorted(found, key=lambda res: res.score, reverse=True)[:top_k]
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""Full-section extraction — expands search results to complete functions/classes.
|
|
2
|
+
|
|
3
|
+
When a search hit lands inside a function or class, this module looks up
|
|
4
|
+
the symbol registry to return the *entire* enclosing symbol body, not
|
|
5
|
+
just the matching chunk.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from semantic_code_intelligence.services.search_service import SearchResult
|
|
14
|
+
from semantic_code_intelligence.storage.symbol_registry import SymbolRegistry
|
|
15
|
+
from semantic_code_intelligence.utils.logging import get_logger
|
|
16
|
+
|
|
17
|
+
# Module-level logger for this file, obtained from the project's logging helper
# under the name "search.section".
logger = get_logger("search.section")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _read_lines(file_path: str, start: int, end: int) -> str:
    """Read lines ``[start, end]`` (1-indexed, inclusive) from a file.

    The file is decoded as UTF-8 with undecodable bytes replaced. Bounds are
    clamped to the file: a *start* below 1 reads from the first line and an
    *end* past the last line reads to the end.

    Returns:
        The selected lines joined with ``"\\n"``, or ``""`` when the file
        cannot be read or the range is empty (including ``end < start`` and
        non-positive *end*).
    """
    try:
        all_lines = Path(file_path).read_text(encoding="utf-8", errors="replace").splitlines()
    except OSError:  # includes PermissionError and FileNotFoundError
        return ""
    first = max(0, start - 1)
    # Clamp at 0 as well: a negative end would otherwise be interpreted by
    # the slice as "count from the end of the file" and return wrong lines.
    last = max(0, min(len(all_lines), end))
    return "\n".join(all_lines[first:last])
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def expand_to_full_section(
    results: list[SearchResult],
    project_root: Path,
    index_dir: Path,
) -> list[SearchResult]:
    """Expand each search result to the full enclosing function/class.

    For every result, the symbol registry is searched for the tightest symbol
    in the same file whose line range contains the result's range. When one is
    found, a new SearchResult covering that symbol is emitted, with its text
    re-read from disk. Results that expand to the same file and line range
    are emitted once (first occurrence wins).

    If no enclosing symbol is found (e.g. unsupported language), the original
    result is kept unchanged. If the symbol registry cannot be loaded, the
    input list is returned as-is.

    Args:
        results: Search results to expand.
        project_root: Root of the project; used to derive the registry lookup
            path and to resolve relative result paths when reading files.
        index_dir: Index directory (for symbol registry).

    Returns:
        New list of SearchResult with expanded content and line ranges.
    """
    try:
        registry = SymbolRegistry.load(index_dir)
    except Exception:
        # Deliberately broad: expansion is optional, so any failure to load
        # the registry degrades to "no expansion" rather than an error.
        logger.debug("Symbol registry not found; returning results unchanged.")
        return results

    root = Path(project_root)
    expanded: list[SearchResult] = []
    seen_keys: set[tuple[str, int, int]] = set()

    for r in results:
        # Normalise to a project-relative path for the registry lookup.
        try:
            rel = str(Path(r.file_path).relative_to(root))
        except ValueError:
            rel = r.file_path

        # Tightest enclosing symbol: smallest line span among the symbols that
        # contain the hit; min() keeps the first one on ties.
        enclosing = [
            sym
            for sym in registry.find_by_file(rel)
            if sym.start_line <= r.start_line and sym.end_line >= r.end_line
        ]
        best = min(enclosing, key=lambda sym: sym.end_line - sym.start_line, default=None)

        if best is None:
            start, end = r.start_line, r.end_line
        else:
            start, end = best.start_line, best.end_line

        dedup_key = (r.file_path, start, end)
        if dedup_key in seen_keys:
            continue
        seen_keys.add(dedup_key)

        if best is None:
            expanded.append(r)
            continue

        # Resolve against the project root so a project-relative file_path is
        # read from the project rather than from the process's working
        # directory; an absolute file_path is left untouched by the join.
        content = _read_lines(str(root / r.file_path), start, end)
        expanded.append(
            SearchResult(
                file_path=r.file_path,
                start_line=start,
                end_line=end,
                language=r.language,
                # Fall back to the original chunk text if the file is unreadable.
                content=content or r.content,
                score=r.score,
                chunk_index=r.chunk_index,
            )
        )

    return expanded
|
|
File without changes
|