codexa 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codexa-0.4.0.dist-info/METADATA +650 -0
- codexa-0.4.0.dist-info/RECORD +189 -0
- codexa-0.4.0.dist-info/WHEEL +5 -0
- codexa-0.4.0.dist-info/entry_points.txt +2 -0
- codexa-0.4.0.dist-info/licenses/LICENSE +21 -0
- codexa-0.4.0.dist-info/top_level.txt +1 -0
- semantic_code_intelligence/__init__.py +5 -0
- semantic_code_intelligence/analysis/__init__.py +21 -0
- semantic_code_intelligence/analysis/ai_features.py +351 -0
- semantic_code_intelligence/bridge/__init__.py +28 -0
- semantic_code_intelligence/bridge/context_provider.py +245 -0
- semantic_code_intelligence/bridge/protocol.py +167 -0
- semantic_code_intelligence/bridge/server.py +348 -0
- semantic_code_intelligence/bridge/vscode.py +271 -0
- semantic_code_intelligence/ci/__init__.py +13 -0
- semantic_code_intelligence/ci/hooks.py +98 -0
- semantic_code_intelligence/ci/hotspots.py +272 -0
- semantic_code_intelligence/ci/impact.py +246 -0
- semantic_code_intelligence/ci/metrics.py +591 -0
- semantic_code_intelligence/ci/pr.py +412 -0
- semantic_code_intelligence/ci/quality.py +557 -0
- semantic_code_intelligence/ci/templates.py +164 -0
- semantic_code_intelligence/ci/trace.py +224 -0
- semantic_code_intelligence/cli/__init__.py +0 -0
- semantic_code_intelligence/cli/commands/__init__.py +0 -0
- semantic_code_intelligence/cli/commands/ask_cmd.py +153 -0
- semantic_code_intelligence/cli/commands/benchmark_cmd.py +303 -0
- semantic_code_intelligence/cli/commands/chat_cmd.py +252 -0
- semantic_code_intelligence/cli/commands/ci_gen_cmd.py +74 -0
- semantic_code_intelligence/cli/commands/context_cmd.py +120 -0
- semantic_code_intelligence/cli/commands/cross_refactor_cmd.py +113 -0
- semantic_code_intelligence/cli/commands/deps_cmd.py +91 -0
- semantic_code_intelligence/cli/commands/docs_cmd.py +101 -0
- semantic_code_intelligence/cli/commands/doctor_cmd.py +147 -0
- semantic_code_intelligence/cli/commands/evolve_cmd.py +171 -0
- semantic_code_intelligence/cli/commands/explain_cmd.py +112 -0
- semantic_code_intelligence/cli/commands/gate_cmd.py +135 -0
- semantic_code_intelligence/cli/commands/grep_cmd.py +234 -0
- semantic_code_intelligence/cli/commands/hotspots_cmd.py +119 -0
- semantic_code_intelligence/cli/commands/impact_cmd.py +131 -0
- semantic_code_intelligence/cli/commands/index_cmd.py +138 -0
- semantic_code_intelligence/cli/commands/init_cmd.py +152 -0
- semantic_code_intelligence/cli/commands/investigate_cmd.py +163 -0
- semantic_code_intelligence/cli/commands/languages_cmd.py +101 -0
- semantic_code_intelligence/cli/commands/lsp_cmd.py +49 -0
- semantic_code_intelligence/cli/commands/mcp_cmd.py +50 -0
- semantic_code_intelligence/cli/commands/metrics_cmd.py +264 -0
- semantic_code_intelligence/cli/commands/models_cmd.py +157 -0
- semantic_code_intelligence/cli/commands/plugin_cmd.py +275 -0
- semantic_code_intelligence/cli/commands/pr_summary_cmd.py +178 -0
- semantic_code_intelligence/cli/commands/quality_cmd.py +208 -0
- semantic_code_intelligence/cli/commands/refactor_cmd.py +103 -0
- semantic_code_intelligence/cli/commands/review_cmd.py +88 -0
- semantic_code_intelligence/cli/commands/search_cmd.py +236 -0
- semantic_code_intelligence/cli/commands/serve_cmd.py +117 -0
- semantic_code_intelligence/cli/commands/suggest_cmd.py +100 -0
- semantic_code_intelligence/cli/commands/summary_cmd.py +78 -0
- semantic_code_intelligence/cli/commands/tool_cmd.py +282 -0
- semantic_code_intelligence/cli/commands/trace_cmd.py +123 -0
- semantic_code_intelligence/cli/commands/tui_cmd.py +58 -0
- semantic_code_intelligence/cli/commands/viz_cmd.py +127 -0
- semantic_code_intelligence/cli/commands/watch_cmd.py +72 -0
- semantic_code_intelligence/cli/commands/web_cmd.py +61 -0
- semantic_code_intelligence/cli/commands/workspace_cmd.py +250 -0
- semantic_code_intelligence/cli/main.py +65 -0
- semantic_code_intelligence/cli/router.py +92 -0
- semantic_code_intelligence/config/__init__.py +0 -0
- semantic_code_intelligence/config/settings.py +260 -0
- semantic_code_intelligence/context/__init__.py +19 -0
- semantic_code_intelligence/context/engine.py +429 -0
- semantic_code_intelligence/context/memory.py +253 -0
- semantic_code_intelligence/daemon/__init__.py +1 -0
- semantic_code_intelligence/daemon/watcher.py +515 -0
- semantic_code_intelligence/docs/__init__.py +1080 -0
- semantic_code_intelligence/embeddings/__init__.py +0 -0
- semantic_code_intelligence/embeddings/enhanced.py +131 -0
- semantic_code_intelligence/embeddings/generator.py +149 -0
- semantic_code_intelligence/embeddings/model_registry.py +100 -0
- semantic_code_intelligence/evolution/__init__.py +1 -0
- semantic_code_intelligence/evolution/budget_guard.py +111 -0
- semantic_code_intelligence/evolution/commit_manager.py +88 -0
- semantic_code_intelligence/evolution/context_builder.py +131 -0
- semantic_code_intelligence/evolution/engine.py +249 -0
- semantic_code_intelligence/evolution/patch_generator.py +229 -0
- semantic_code_intelligence/evolution/task_selector.py +214 -0
- semantic_code_intelligence/evolution/test_runner.py +111 -0
- semantic_code_intelligence/indexing/__init__.py +0 -0
- semantic_code_intelligence/indexing/chunker.py +174 -0
- semantic_code_intelligence/indexing/parallel.py +86 -0
- semantic_code_intelligence/indexing/scanner.py +146 -0
- semantic_code_intelligence/indexing/semantic_chunker.py +337 -0
- semantic_code_intelligence/llm/__init__.py +62 -0
- semantic_code_intelligence/llm/cache.py +219 -0
- semantic_code_intelligence/llm/cached_provider.py +145 -0
- semantic_code_intelligence/llm/conversation.py +190 -0
- semantic_code_intelligence/llm/cross_refactor.py +272 -0
- semantic_code_intelligence/llm/investigation.py +274 -0
- semantic_code_intelligence/llm/mock_provider.py +77 -0
- semantic_code_intelligence/llm/ollama_provider.py +122 -0
- semantic_code_intelligence/llm/openai_provider.py +100 -0
- semantic_code_intelligence/llm/provider.py +92 -0
- semantic_code_intelligence/llm/rate_limiter.py +164 -0
- semantic_code_intelligence/llm/reasoning.py +438 -0
- semantic_code_intelligence/llm/safety.py +110 -0
- semantic_code_intelligence/llm/streaming.py +251 -0
- semantic_code_intelligence/lsp/__init__.py +609 -0
- semantic_code_intelligence/mcp/__init__.py +393 -0
- semantic_code_intelligence/parsing/__init__.py +19 -0
- semantic_code_intelligence/parsing/parser.py +375 -0
- semantic_code_intelligence/plugins/__init__.py +255 -0
- semantic_code_intelligence/plugins/examples/__init__.py +1 -0
- semantic_code_intelligence/plugins/examples/code_quality.py +73 -0
- semantic_code_intelligence/plugins/examples/search_annotator.py +56 -0
- semantic_code_intelligence/scalability/__init__.py +205 -0
- semantic_code_intelligence/search/__init__.py +0 -0
- semantic_code_intelligence/search/formatter.py +123 -0
- semantic_code_intelligence/search/grep.py +361 -0
- semantic_code_intelligence/search/hybrid_search.py +170 -0
- semantic_code_intelligence/search/keyword_search.py +311 -0
- semantic_code_intelligence/search/section_expander.py +103 -0
- semantic_code_intelligence/services/__init__.py +0 -0
- semantic_code_intelligence/services/indexing_service.py +630 -0
- semantic_code_intelligence/services/search_service.py +269 -0
- semantic_code_intelligence/storage/__init__.py +0 -0
- semantic_code_intelligence/storage/chunk_hash_store.py +86 -0
- semantic_code_intelligence/storage/hash_store.py +66 -0
- semantic_code_intelligence/storage/index_manifest.py +85 -0
- semantic_code_intelligence/storage/index_stats.py +138 -0
- semantic_code_intelligence/storage/query_history.py +160 -0
- semantic_code_intelligence/storage/symbol_registry.py +209 -0
- semantic_code_intelligence/storage/vector_store.py +297 -0
- semantic_code_intelligence/tests/__init__.py +0 -0
- semantic_code_intelligence/tests/test_ai_features.py +351 -0
- semantic_code_intelligence/tests/test_chunker.py +119 -0
- semantic_code_intelligence/tests/test_cli.py +188 -0
- semantic_code_intelligence/tests/test_config.py +154 -0
- semantic_code_intelligence/tests/test_context.py +381 -0
- semantic_code_intelligence/tests/test_embeddings.py +73 -0
- semantic_code_intelligence/tests/test_endtoend.py +1142 -0
- semantic_code_intelligence/tests/test_enhanced_embeddings.py +92 -0
- semantic_code_intelligence/tests/test_hash_store.py +79 -0
- semantic_code_intelligence/tests/test_logging.py +55 -0
- semantic_code_intelligence/tests/test_new_cli.py +138 -0
- semantic_code_intelligence/tests/test_parser.py +495 -0
- semantic_code_intelligence/tests/test_phase10.py +355 -0
- semantic_code_intelligence/tests/test_phase11.py +593 -0
- semantic_code_intelligence/tests/test_phase12.py +375 -0
- semantic_code_intelligence/tests/test_phase13.py +663 -0
- semantic_code_intelligence/tests/test_phase14.py +568 -0
- semantic_code_intelligence/tests/test_phase15.py +814 -0
- semantic_code_intelligence/tests/test_phase16.py +792 -0
- semantic_code_intelligence/tests/test_phase17.py +815 -0
- semantic_code_intelligence/tests/test_phase18.py +934 -0
- semantic_code_intelligence/tests/test_phase19.py +986 -0
- semantic_code_intelligence/tests/test_phase20.py +2753 -0
- semantic_code_intelligence/tests/test_phase20b.py +2058 -0
- semantic_code_intelligence/tests/test_phase20c.py +962 -0
- semantic_code_intelligence/tests/test_phase21.py +428 -0
- semantic_code_intelligence/tests/test_phase22.py +799 -0
- semantic_code_intelligence/tests/test_phase23.py +783 -0
- semantic_code_intelligence/tests/test_phase24.py +715 -0
- semantic_code_intelligence/tests/test_phase25.py +496 -0
- semantic_code_intelligence/tests/test_phase26.py +251 -0
- semantic_code_intelligence/tests/test_phase27.py +531 -0
- semantic_code_intelligence/tests/test_phase8.py +592 -0
- semantic_code_intelligence/tests/test_phase9.py +643 -0
- semantic_code_intelligence/tests/test_plugins.py +293 -0
- semantic_code_intelligence/tests/test_priority_features.py +727 -0
- semantic_code_intelligence/tests/test_router.py +41 -0
- semantic_code_intelligence/tests/test_scalability.py +138 -0
- semantic_code_intelligence/tests/test_scanner.py +125 -0
- semantic_code_intelligence/tests/test_search.py +160 -0
- semantic_code_intelligence/tests/test_semantic_chunker.py +255 -0
- semantic_code_intelligence/tests/test_tools.py +182 -0
- semantic_code_intelligence/tests/test_vector_store.py +151 -0
- semantic_code_intelligence/tests/test_watcher.py +211 -0
- semantic_code_intelligence/tools/__init__.py +442 -0
- semantic_code_intelligence/tools/executor.py +232 -0
- semantic_code_intelligence/tools/protocol.py +200 -0
- semantic_code_intelligence/tui/__init__.py +454 -0
- semantic_code_intelligence/utils/__init__.py +0 -0
- semantic_code_intelligence/utils/logging.py +112 -0
- semantic_code_intelligence/version.py +3 -0
- semantic_code_intelligence/web/__init__.py +11 -0
- semantic_code_intelligence/web/api.py +289 -0
- semantic_code_intelligence/web/server.py +397 -0
- semantic_code_intelligence/web/ui.py +659 -0
- semantic_code_intelligence/web/visualize.py +226 -0
- semantic_code_intelligence/workspace/__init__.py +427 -0
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
"""Search service — handles semantic, keyword, regex, and hybrid search.
|
|
2
|
+
|
|
3
|
+
Supports four modes:
|
|
4
|
+
- **semantic** (default): FAISS cosine-similarity search
|
|
5
|
+
- **keyword**: BM25-ranked keyword search
|
|
6
|
+
- **regex**: grep-compatible regex pattern matching
|
|
7
|
+
- **hybrid**: Reciprocal Rank Fusion of semantic + BM25
|
|
8
|
+
|
|
9
|
+
Also supports ``--full-section`` expansion and auto-indexing.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any, Literal
|
|
17
|
+
|
|
18
|
+
from semantic_code_intelligence.config.settings import AppConfig, load_config
|
|
19
|
+
from semantic_code_intelligence.embeddings.generator import generate_embeddings
|
|
20
|
+
from semantic_code_intelligence.storage.query_history import QueryHistory
|
|
21
|
+
from semantic_code_intelligence.storage.vector_store import ChunkMetadata, VectorStore
|
|
22
|
+
from semantic_code_intelligence.utils.logging import get_logger
|
|
23
|
+
|
|
24
|
+
logger = get_logger("services.search")
|
|
25
|
+
|
|
26
|
+
SearchMode = Literal["semantic", "keyword", "regex", "hybrid"]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class SearchResult:
    """One hit produced by a search backend.

    Carries the location of the matched chunk (file and line span), the
    chunk's language and text, the score assigned by the backend, and the
    chunk's index.
    """

    file_path: str
    start_line: int
    end_line: int
    language: str
    content: str
    score: float
    chunk_index: int

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-serializable dict; the score is rounded to 4 decimals."""
        serialised: dict[str, Any] = dict(
            file_path=self.file_path,
            start_line=self.start_line,
            end_line=self.end_line,
            language=self.language,
            content=self.content,
            score=round(self.score, 4),
            chunk_index=self.chunk_index,
        )
        return serialised
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _auto_index_if_needed(project_root: Path, index_dir: Path) -> None:
|
|
55
|
+
"""Run indexing transparently if no vector store exists yet."""
|
|
56
|
+
vectors_path = index_dir / "vectors.faiss"
|
|
57
|
+
if vectors_path.exists():
|
|
58
|
+
return
|
|
59
|
+
logger.info("No index found — auto-indexing %s", project_root)
|
|
60
|
+
from semantic_code_intelligence.services.indexing_service import run_indexing
|
|
61
|
+
run_indexing(project_root, force=False)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _load_store(index_dir: Path) -> VectorStore:
    """Load the vector store from *index_dir* and log how many vectors it holds.

    Any error raised by ``VectorStore.load`` (the original notes
    FileNotFoundError for a missing index) propagates to the caller.
    """
    loaded = VectorStore.load(index_dir)
    logger.info("Loaded vector store with %d vectors.", loaded.size)
    return loaded
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _record_history(
    index_dir: Path,
    query: str,
    results: list[SearchResult],
) -> None:
    """Record a query in the persistent history (best-effort).

    Loads the history stored under *index_dir*, appends one entry describing
    this query, and saves it back. Any failure is logged at debug level
    (with the traceback) and otherwise ignored, so a history problem never
    fails the search itself.

    Args:
        index_dir: Directory the query history is loaded from and saved to.
        query: The query text as given by the caller.
        results: The results returned for the query; the first entry's score
            is recorded as the top score (0.0 when there are no results).
    """
    try:
        history = QueryHistory.load(index_dir)
        languages = sorted({r.language for r in results if r.language})
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # recorded files follow the order of the results.
        top_files = list(dict.fromkeys(r.file_path for r in results))[:5]
        history.record(
            query=query,
            result_count=len(results),
            top_score=results[0].score if results else 0.0,
            languages=languages,
            top_files=top_files,
        )
        history.save(index_dir)
    except Exception:
        # Deliberately broad: history is a convenience feature. Keep the
        # traceback so the failure can still be diagnosed from debug logs.
        logger.debug("Failed to record query history.", exc_info=True)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# ------------------------------------------------------------------
|
|
94
|
+
# Semantic search (original behaviour)
|
|
95
|
+
# ------------------------------------------------------------------
|
|
96
|
+
|
|
97
|
+
def _semantic_search(
    query: str,
    store: VectorStore,
    config: Any,
    top_k: int,
    threshold: float,
) -> list[SearchResult]:
    """Embed *query*, search *store*, and keep hits scoring at least *threshold*.

    The embedding model name is read from ``config.embedding.model_name``.
    Hits are returned in the order the store yields them.
    """
    embeddings = generate_embeddings(
        [query], model_name=config.embedding.model_name,
    )
    matches = store.search(embeddings[0], top_k=top_k)

    return [
        SearchResult(
            file_path=meta.file_path,
            start_line=meta.start_line,
            end_line=meta.end_line,
            language=meta.language,
            content=meta.content,
            score=score,
            chunk_index=meta.chunk_index,
        )
        for meta, score in matches
        if not (score < threshold)
    ]
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
# ------------------------------------------------------------------
|
|
128
|
+
# Keyword / regex / hybrid helpers
|
|
129
|
+
# ------------------------------------------------------------------
|
|
130
|
+
|
|
131
|
+
def _keyword_search(
    query: str,
    store: VectorStore,
    index_dir: Path,
    top_k: int,
) -> list[SearchResult]:
    """Run the keyword backend and wrap each hit as a SearchResult."""
    from semantic_code_intelligence.search.keyword_search import keyword_search

    converted: list[SearchResult] = []
    for hit in keyword_search(query, store, index_dir, top_k=top_k):
        converted.append(
            SearchResult(
                file_path=hit.file_path,
                start_line=hit.start_line,
                end_line=hit.end_line,
                language=hit.language,
                content=hit.content,
                score=hit.score,
                chunk_index=hit.chunk_index,
            )
        )
    return converted
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _regex_search(
    pattern: str,
    store: VectorStore,
    top_k: int,
    case_insensitive: bool = True,
) -> list[SearchResult]:
    """Run the regex backend and wrap each hit as a SearchResult.

    *case_insensitive* is forwarded unchanged to the backend.
    """
    from semantic_code_intelligence.search.keyword_search import regex_search

    converted: list[SearchResult] = []
    for hit in regex_search(
        pattern, store, top_k=top_k, case_insensitive=case_insensitive
    ):
        converted.append(
            SearchResult(
                file_path=hit.file_path,
                start_line=hit.start_line,
                end_line=hit.end_line,
                language=hit.language,
                content=hit.content,
                score=hit.score,
                chunk_index=hit.chunk_index,
            )
        )
    return converted
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _hybrid_search(
    query: str,
    store: VectorStore,
    index_dir: Path,
    config: Any,
    top_k: int,
) -> list[SearchResult]:
    """Run the hybrid backend and wrap each hit as a SearchResult.

    The embedding model name is read from ``config.embedding.model_name``
    and passed through to the backend.
    """
    from semantic_code_intelligence.search.hybrid_search import hybrid_search

    fused_hits = hybrid_search(
        query,
        store,
        index_dir,
        model_name=config.embedding.model_name,
        top_k=top_k,
    )

    converted: list[SearchResult] = []
    for hit in fused_hits:
        converted.append(
            SearchResult(
                file_path=hit.file_path,
                start_line=hit.start_line,
                end_line=hit.end_line,
                language=hit.language,
                content=hit.content,
                score=hit.score,
                chunk_index=hit.chunk_index,
            )
        )
    return converted
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
# ------------------------------------------------------------------
|
|
203
|
+
# Public API
|
|
204
|
+
# ------------------------------------------------------------------
|
|
205
|
+
|
|
206
|
+
def search_codebase(
    query: str,
    project_root: Path,
    top_k: int | None = None,
    threshold: float | None = None,
    mode: SearchMode = "semantic",
    full_section: bool = False,
    auto_index: bool = True,
    case_insensitive: bool = True,
) -> list[SearchResult]:
    """Search the indexed codebase with the selected backend.

    Args:
        query: Natural language query, keywords, or regex pattern.
        project_root: Root directory of the project (resolved before use).
        top_k: Number of results to request. A falsy value (``None`` or 0)
            selects the configured default.
        threshold: Minimum similarity score; ``None`` selects the configured
            default. Only the semantic backend receives this value.
        mode: ``"keyword"``, ``"regex"`` or ``"hybrid"`` select those
            backends; any other value runs the semantic backend.
        full_section: If True, non-empty results are passed through the
            full-section expander.
        auto_index: If True, index automatically when no index exists.
        case_insensitive: For regex mode, whether to ignore case.

    Returns:
        List of SearchResult objects; empty when the store holds no vectors.

    Raises:
        FileNotFoundError: If no vector index exists and auto_index is False.
    """
    project_root = project_root.resolve()
    config = load_config(project_root)
    index_dir = AppConfig.index_dir(project_root)

    # Resolve caller overrides against the configured defaults.
    limit = top_k or config.search.top_k
    if threshold is None:
        min_score = config.search.similarity_threshold
    else:
        min_score = threshold

    if auto_index:
        _auto_index_if_needed(project_root, index_dir)

    store = _load_store(index_dir)
    if store.size == 0:
        return []

    # Pick the backend; anything that is not an explicit alternative mode
    # falls through to semantic search.
    if mode == "hybrid":
        results = _hybrid_search(query, store, index_dir, config, limit)
    elif mode == "regex":
        results = _regex_search(query, store, limit, case_insensitive)
    elif mode == "keyword":
        results = _keyword_search(query, store, index_dir, limit)
    else:
        results = _semantic_search(query, store, config, limit, min_score)

    logger.info("Found %d results (mode=%s).", len(results), mode)

    if full_section:
        if results:
            from semantic_code_intelligence.search.section_expander import expand_to_full_section

            results = expand_to_full_section(results, project_root, index_dir)

    # History is recorded with the final (possibly expanded) results.
    _record_history(index_dir, query, results)

    return results
|
|
File without changes
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Chunk hash store — tracks content hashes at the individual chunk level.
|
|
2
|
+
|
|
3
|
+
Enables chunk-level incremental indexing: when a file changes, only the
|
|
4
|
+
chunks whose content actually differs are re-embedded, while unchanged
|
|
5
|
+
chunks keep their existing vectors.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import hashlib
|
|
11
|
+
import json
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
from semantic_code_intelligence.utils.logging import get_logger
|
|
15
|
+
|
|
16
|
+
logger = get_logger("storage.chunk_hashes")
|
|
17
|
+
|
|
18
|
+
CHUNK_HASH_FILE = "chunk_hashes.json"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def compute_chunk_hash(content: str) -> str:
    """Return the hex SHA-256 digest of *content* encoded as UTF-8.

    Characters that cannot be encoded are substituted (``errors="replace"``)
    instead of raising.
    """
    hasher = hashlib.sha256()
    hasher.update(content.encode("utf-8", errors="replace"))
    return hasher.hexdigest()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ChunkHashStore:
    """Persists a mapping of chunk keys to content hashes.

    Chunk key format: ``"file_path:start_line:end_line"``

    Used for chunk-level incremental indexing: only re-embed chunks
    whose content has actually changed, even if the file was modified.
    """

    def __init__(self) -> None:
        self._hashes: dict[str, str] = {}

    @staticmethod
    def chunk_key(file_path: str, start_line: int, end_line: int) -> str:
        """Build the store key for one chunk of *file_path*."""
        return f"{file_path}:{start_line}:{end_line}"

    @staticmethod
    def _key_belongs_to(key: str, file_path: str) -> bool:
        """Return True if *key* addresses a chunk of exactly *file_path*.

        A plain prefix test is not enough: the key of a file named
        ``a.py:Zone.Identifier`` also starts with ``a.py:``. After the
        prefix, a chunk key carries only ``start:end``, i.e. at most one
        more colon.
        """
        prefix = file_path + ":"
        return key.startswith(prefix) and key[len(prefix):].count(":") <= 1

    @property
    def count(self) -> int:
        """Number of tracked chunks."""
        return len(self._hashes)

    def get(self, key: str) -> str | None:
        """Return the stored hash for *key*, or ``None`` if it is not tracked."""
        return self._hashes.get(key)

    def set(self, key: str, content_hash: str) -> None:
        """Store or update the hash for *key*."""
        self._hashes[key] = content_hash

    def has_changed(self, key: str, content_hash: str) -> bool:
        """Return True if *key* is new or its stored hash differs."""
        stored = self._hashes.get(key)
        return stored != content_hash

    def remove(self, key: str) -> None:
        """Drop *key* from the store; unknown keys are ignored."""
        self._hashes.pop(key, None)

    def remove_by_file(self, file_path: str) -> int:
        """Remove all chunk entries belonging to *file_path*.

        Returns:
            The number of entries removed.
        """
        keys_to_remove = self.keys_for_file(file_path)
        for k in keys_to_remove:
            del self._hashes[k]
        return len(keys_to_remove)

    def keys_for_file(self, file_path: str) -> list[str]:
        """Return the keys of all tracked chunks belonging to *file_path*."""
        return [k for k in self._hashes if self._key_belongs_to(k, file_path)]

    def save(self, directory: Path) -> None:
        """Write the hashes as JSON into *directory*, creating it if needed."""
        directory = Path(directory)
        directory.mkdir(parents=True, exist_ok=True)
        path = directory / CHUNK_HASH_FILE
        path.write_text(
            json.dumps(self._hashes, ensure_ascii=False),
            encoding="utf-8",
        )

    @classmethod
    def load(cls, directory: Path) -> "ChunkHashStore":
        """Load hashes from *directory*.

        Returns an empty store when the file is missing, unreadable, or does
        not contain a JSON object. With an empty store every chunk reports
        as changed, so a damaged hash file costs a re-embed rather than a
        crash.
        """
        store = cls()
        path = Path(directory) / CHUNK_HASH_FILE
        if not path.exists():
            return store
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            logger.warning("Ignoring unreadable chunk hash file %s: %s", path, exc)
            return store
        if isinstance(data, dict):
            store._hashes = data
        else:
            logger.warning("Ignoring chunk hash file %s: not a JSON object.", path)
        return store
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Hash store — tracks file content hashes for incremental indexing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from semantic_code_intelligence.utils.logging import get_logger
|
|
9
|
+
|
|
10
|
+
logger = get_logger("storage.hashes")
|
|
11
|
+
|
|
12
|
+
HASH_FILE_NAME = "file_hashes.json"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class HashStore:
    """Persists a mapping of file paths to content hashes.

    Used for incremental indexing: only re-index files whose hash has changed.
    """

    def __init__(self) -> None:
        self._hashes: dict[str, str] = {}

    @property
    def count(self) -> int:
        """Number of tracked files."""
        return len(self._hashes)

    def get(self, file_path: str) -> str | None:
        """Get the stored hash for a file path."""
        return self._hashes.get(file_path)

    def set(self, file_path: str, content_hash: str) -> None:
        """Store or update the hash for a file path."""
        self._hashes[file_path] = content_hash

    def has_changed(self, file_path: str, content_hash: str) -> bool:
        """Check if a file's content has changed since last indexed.

        Returns True if the file is new or its hash differs.
        """
        stored = self._hashes.get(file_path)
        return stored != content_hash

    def remove(self, file_path: str) -> None:
        """Remove a file from the hash store."""
        self._hashes.pop(file_path, None)

    def save(self, directory: Path) -> None:
        """Save hashes to disk, creating *directory* if needed."""
        directory = Path(directory)
        directory.mkdir(parents=True, exist_ok=True)
        path = directory / HASH_FILE_NAME
        path.write_text(
            json.dumps(self._hashes, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )

    @classmethod
    def load(cls, directory: Path) -> "HashStore":
        """Load hashes from disk.

        Returns an empty store if the file doesn't exist, cannot be read,
        or does not contain a JSON object. With an empty store every file
        reports as changed, so a damaged hash file costs a re-index rather
        than a crash.
        """
        store = cls()
        path = Path(directory) / HASH_FILE_NAME
        if not path.exists():
            return store
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            logger.warning("Ignoring unreadable hash file %s: %s", path, exc)
            return store
        if isinstance(data, dict):
            store._hashes = data
        else:
            logger.warning("Ignoring hash file %s: not a JSON object.", path)
        return store
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""Index manifest — versioned metadata for the persistent intelligence index.
|
|
2
|
+
|
|
3
|
+
Tracks index schema version, embedding model, creation/update timestamps,
|
|
4
|
+
and file counts to enable integrity checks and safe index upgrades.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import time
|
|
11
|
+
from dataclasses import asdict, dataclass, field
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
MANIFEST_FILE = "index_manifest.json"
SCHEMA_VERSION = 1


@dataclass
class IndexManifest:
    """Metadata describing a persisted intelligence index."""

    schema_version: int = SCHEMA_VERSION
    embedding_model: str = "all-MiniLM-L6-v2"
    embedding_dimension: int = 384
    created_at: float = 0.0
    updated_at: float = 0.0
    total_files: int = 0
    total_chunks: int = 0
    total_symbols: int = 0
    languages: list[str] = field(default_factory=list)
    project_root: str = ""

    # ------------------------------------------------------------------
    # Serialisation
    # ------------------------------------------------------------------

    def to_dict(self) -> dict[str, Any]:
        """Return the manifest fields as a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> IndexManifest:
        """Build a manifest from *data*, ignoring keys that are not fields.

        Fields missing from *data* keep their defaults.
        """
        known = {f.name for f in cls.__dataclass_fields__.values()}  # type: ignore[attr-defined]
        filtered = {k: v for k, v in data.items() if k in known}
        return cls(**filtered)

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------

    def save(self, directory: str | Path) -> None:
        """Write the manifest to *directory*/index_manifest.json."""
        path = Path(directory)
        path.mkdir(parents=True, exist_ok=True)
        (path / MANIFEST_FILE).write_text(
            json.dumps(self.to_dict(), indent=2, ensure_ascii=False),
            encoding="utf-8",
        )

    @classmethod
    def load(cls, directory: str | Path) -> IndexManifest | None:
        """Load an existing manifest, or return ``None`` if absent or unusable.

        ``None`` is returned when the file is missing, cannot be read or
        decoded, is not valid JSON, or holds something other than a JSON
        object.
        """
        path = Path(directory) / MANIFEST_FILE
        if not path.exists():
            return None
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            return None
        if not isinstance(data, dict):
            # e.g. a top-level list or string: from_dict needs a mapping.
            return None
        return cls.from_dict(data)

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def touch(self) -> None:
        """Update ``updated_at`` to now; set ``created_at`` if zero."""
        now = time.time()
        if self.created_at == 0.0:
            self.created_at = now
        self.updated_at = now

    def is_compatible(self, model: str, dimension: int) -> bool:
        """Check whether the index was built with the given model/dimension."""
        return self.embedding_model == model and self.embedding_dimension == dimension
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""Index statistics — health metrics, coverage, and staleness tracking.
|
|
2
|
+
|
|
3
|
+
Provides detailed statistics about the intelligence index including
|
|
4
|
+
per-language coverage, chunk distribution, and staleness metrics
|
|
5
|
+
for monitoring index quality.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import time
|
|
12
|
+
from dataclasses import asdict, dataclass, field
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
STATS_FILE = "index_stats.json"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class LanguageCoverage:
    """Indexing counters for a single language."""

    language: str = ""
    files: int = 0
    chunks: int = 0
    symbols: int = 0
    total_lines: int = 0

    def to_dict(self) -> dict[str, Any]:
        """Return the counters as a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> LanguageCoverage:
        """Build an instance from *data*, skipping keys that are not fields.

        Fields absent from *data* keep their defaults.
        """
        field_names = set(cls.__dataclass_fields__)  # type: ignore[attr-defined]
        kwargs: dict[str, Any] = {}
        for key, value in data.items():
            if key in field_names:
                kwargs[key] = value
        return cls(**kwargs)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class IndexStats:
    """Comprehensive index health and coverage statistics.

    A plain data holder that round-trips through ``to_dict`` / ``from_dict``
    and is persisted as JSON by ``save`` / ``load``.
    """

    # Counts
    total_files: int = 0
    total_chunks: int = 0
    total_symbols: int = 0
    total_vectors: int = 0

    # Timing
    last_indexed_at: float = 0.0  # compared against time.time(); 0.0 means "never"
    indexing_duration_seconds: float = 0.0

    # Per-language breakdown
    language_coverage: list[LanguageCoverage] = field(default_factory=list)

    # Staleness
    stale_files: int = 0  # files changed since last index

    # Quality
    avg_chunk_size: float = 0.0
    embedding_model: str = ""
    embedding_dimension: int = 0

    # ------------------------------------------------------------------
    # Serialisation
    # ------------------------------------------------------------------

    def to_dict(self) -> dict[str, Any]:
        """Return all fields as a dictionary, with coverage entries as dicts."""
        d = asdict(self)
        d["language_coverage"] = [lc.to_dict() for lc in self.language_coverage]
        return d

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> IndexStats:
        """Build an instance from ``data``, ignoring keys that are not fields.

        ``data`` is left unmodified. Entries of ``language_coverage`` that are
        dicts are converted with ``LanguageCoverage.from_dict``; any other
        entry is kept as-is.
        """
        # Read rather than pop so the caller's dictionary is not mutated.
        lang_data = data.get("language_coverage", [])
        known = {f.name for f in cls.__dataclass_fields__.values()}  # type: ignore[attr-defined]
        filtered = {k: v for k, v in data.items() if k in known and k != "language_coverage"}
        stats = cls(**filtered)
        stats.language_coverage = [
            LanguageCoverage.from_dict(lc) if isinstance(lc, dict) else lc
            for lc in lang_data
        ]
        return stats

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------

    def save(self, directory: str | Path) -> None:
        """Write stats as JSON to ``STATS_FILE`` inside ``directory``.

        The directory (and any missing parents) is created first.
        """
        path = Path(directory)
        path.mkdir(parents=True, exist_ok=True)
        (path / STATS_FILE).write_text(
            json.dumps(self.to_dict(), indent=2, ensure_ascii=False),
            encoding="utf-8",
        )

    @classmethod
    def load(cls, directory: str | Path) -> IndexStats | None:
        """Load stats from disk, or return ``None`` if absent or unreadable.

        ``None`` is returned when the file does not exist, cannot be read, is
        not valid UTF-8, is not valid JSON, or does not hold a JSON object.
        """
        path = Path(directory) / STATS_FILE
        if not path.exists():
            return None
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, UnicodeDecodeError, OSError):
            # UnicodeDecodeError comes from read_text() on non-UTF-8 bytes.
            return None
        if not isinstance(data, dict):
            # Valid JSON but not an object (e.g. a list): treat as corrupt.
            return None
        return cls.from_dict(data)

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    @property
    def staleness_seconds(self) -> float:
        """Seconds since the last indexing run (``0.0`` if never indexed)."""
        if self.last_indexed_at == 0.0:
            return 0.0
        return time.time() - self.last_indexed_at

    @property
    def languages(self) -> list[str]:
        """Return all indexed languages, in stored order."""
        return [lc.language for lc in self.language_coverage]

    def get_language(self, language: str) -> LanguageCoverage | None:
        """Return coverage for a specific language, or ``None`` if not present."""
        for lc in self.language_coverage:
            if lc.language == language:
                return lc
        return None

    def set_language(self, coverage: LanguageCoverage) -> None:
        """Add or replace per-language coverage entry.

        An existing entry with the same ``language`` is replaced in place;
        otherwise ``coverage`` is appended.
        """
        for i, lc in enumerate(self.language_coverage):
            if lc.language == coverage.language:
                self.language_coverage[i] = coverage
                return
        self.language_coverage.append(coverage)
|