codexa 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codexa-0.4.0.dist-info/METADATA +650 -0
- codexa-0.4.0.dist-info/RECORD +189 -0
- codexa-0.4.0.dist-info/WHEEL +5 -0
- codexa-0.4.0.dist-info/entry_points.txt +2 -0
- codexa-0.4.0.dist-info/licenses/LICENSE +21 -0
- codexa-0.4.0.dist-info/top_level.txt +1 -0
- semantic_code_intelligence/__init__.py +5 -0
- semantic_code_intelligence/analysis/__init__.py +21 -0
- semantic_code_intelligence/analysis/ai_features.py +351 -0
- semantic_code_intelligence/bridge/__init__.py +28 -0
- semantic_code_intelligence/bridge/context_provider.py +245 -0
- semantic_code_intelligence/bridge/protocol.py +167 -0
- semantic_code_intelligence/bridge/server.py +348 -0
- semantic_code_intelligence/bridge/vscode.py +271 -0
- semantic_code_intelligence/ci/__init__.py +13 -0
- semantic_code_intelligence/ci/hooks.py +98 -0
- semantic_code_intelligence/ci/hotspots.py +272 -0
- semantic_code_intelligence/ci/impact.py +246 -0
- semantic_code_intelligence/ci/metrics.py +591 -0
- semantic_code_intelligence/ci/pr.py +412 -0
- semantic_code_intelligence/ci/quality.py +557 -0
- semantic_code_intelligence/ci/templates.py +164 -0
- semantic_code_intelligence/ci/trace.py +224 -0
- semantic_code_intelligence/cli/__init__.py +0 -0
- semantic_code_intelligence/cli/commands/__init__.py +0 -0
- semantic_code_intelligence/cli/commands/ask_cmd.py +153 -0
- semantic_code_intelligence/cli/commands/benchmark_cmd.py +303 -0
- semantic_code_intelligence/cli/commands/chat_cmd.py +252 -0
- semantic_code_intelligence/cli/commands/ci_gen_cmd.py +74 -0
- semantic_code_intelligence/cli/commands/context_cmd.py +120 -0
- semantic_code_intelligence/cli/commands/cross_refactor_cmd.py +113 -0
- semantic_code_intelligence/cli/commands/deps_cmd.py +91 -0
- semantic_code_intelligence/cli/commands/docs_cmd.py +101 -0
- semantic_code_intelligence/cli/commands/doctor_cmd.py +147 -0
- semantic_code_intelligence/cli/commands/evolve_cmd.py +171 -0
- semantic_code_intelligence/cli/commands/explain_cmd.py +112 -0
- semantic_code_intelligence/cli/commands/gate_cmd.py +135 -0
- semantic_code_intelligence/cli/commands/grep_cmd.py +234 -0
- semantic_code_intelligence/cli/commands/hotspots_cmd.py +119 -0
- semantic_code_intelligence/cli/commands/impact_cmd.py +131 -0
- semantic_code_intelligence/cli/commands/index_cmd.py +138 -0
- semantic_code_intelligence/cli/commands/init_cmd.py +152 -0
- semantic_code_intelligence/cli/commands/investigate_cmd.py +163 -0
- semantic_code_intelligence/cli/commands/languages_cmd.py +101 -0
- semantic_code_intelligence/cli/commands/lsp_cmd.py +49 -0
- semantic_code_intelligence/cli/commands/mcp_cmd.py +50 -0
- semantic_code_intelligence/cli/commands/metrics_cmd.py +264 -0
- semantic_code_intelligence/cli/commands/models_cmd.py +157 -0
- semantic_code_intelligence/cli/commands/plugin_cmd.py +275 -0
- semantic_code_intelligence/cli/commands/pr_summary_cmd.py +178 -0
- semantic_code_intelligence/cli/commands/quality_cmd.py +208 -0
- semantic_code_intelligence/cli/commands/refactor_cmd.py +103 -0
- semantic_code_intelligence/cli/commands/review_cmd.py +88 -0
- semantic_code_intelligence/cli/commands/search_cmd.py +236 -0
- semantic_code_intelligence/cli/commands/serve_cmd.py +117 -0
- semantic_code_intelligence/cli/commands/suggest_cmd.py +100 -0
- semantic_code_intelligence/cli/commands/summary_cmd.py +78 -0
- semantic_code_intelligence/cli/commands/tool_cmd.py +282 -0
- semantic_code_intelligence/cli/commands/trace_cmd.py +123 -0
- semantic_code_intelligence/cli/commands/tui_cmd.py +58 -0
- semantic_code_intelligence/cli/commands/viz_cmd.py +127 -0
- semantic_code_intelligence/cli/commands/watch_cmd.py +72 -0
- semantic_code_intelligence/cli/commands/web_cmd.py +61 -0
- semantic_code_intelligence/cli/commands/workspace_cmd.py +250 -0
- semantic_code_intelligence/cli/main.py +65 -0
- semantic_code_intelligence/cli/router.py +92 -0
- semantic_code_intelligence/config/__init__.py +0 -0
- semantic_code_intelligence/config/settings.py +260 -0
- semantic_code_intelligence/context/__init__.py +19 -0
- semantic_code_intelligence/context/engine.py +429 -0
- semantic_code_intelligence/context/memory.py +253 -0
- semantic_code_intelligence/daemon/__init__.py +1 -0
- semantic_code_intelligence/daemon/watcher.py +515 -0
- semantic_code_intelligence/docs/__init__.py +1080 -0
- semantic_code_intelligence/embeddings/__init__.py +0 -0
- semantic_code_intelligence/embeddings/enhanced.py +131 -0
- semantic_code_intelligence/embeddings/generator.py +149 -0
- semantic_code_intelligence/embeddings/model_registry.py +100 -0
- semantic_code_intelligence/evolution/__init__.py +1 -0
- semantic_code_intelligence/evolution/budget_guard.py +111 -0
- semantic_code_intelligence/evolution/commit_manager.py +88 -0
- semantic_code_intelligence/evolution/context_builder.py +131 -0
- semantic_code_intelligence/evolution/engine.py +249 -0
- semantic_code_intelligence/evolution/patch_generator.py +229 -0
- semantic_code_intelligence/evolution/task_selector.py +214 -0
- semantic_code_intelligence/evolution/test_runner.py +111 -0
- semantic_code_intelligence/indexing/__init__.py +0 -0
- semantic_code_intelligence/indexing/chunker.py +174 -0
- semantic_code_intelligence/indexing/parallel.py +86 -0
- semantic_code_intelligence/indexing/scanner.py +146 -0
- semantic_code_intelligence/indexing/semantic_chunker.py +337 -0
- semantic_code_intelligence/llm/__init__.py +62 -0
- semantic_code_intelligence/llm/cache.py +219 -0
- semantic_code_intelligence/llm/cached_provider.py +145 -0
- semantic_code_intelligence/llm/conversation.py +190 -0
- semantic_code_intelligence/llm/cross_refactor.py +272 -0
- semantic_code_intelligence/llm/investigation.py +274 -0
- semantic_code_intelligence/llm/mock_provider.py +77 -0
- semantic_code_intelligence/llm/ollama_provider.py +122 -0
- semantic_code_intelligence/llm/openai_provider.py +100 -0
- semantic_code_intelligence/llm/provider.py +92 -0
- semantic_code_intelligence/llm/rate_limiter.py +164 -0
- semantic_code_intelligence/llm/reasoning.py +438 -0
- semantic_code_intelligence/llm/safety.py +110 -0
- semantic_code_intelligence/llm/streaming.py +251 -0
- semantic_code_intelligence/lsp/__init__.py +609 -0
- semantic_code_intelligence/mcp/__init__.py +393 -0
- semantic_code_intelligence/parsing/__init__.py +19 -0
- semantic_code_intelligence/parsing/parser.py +375 -0
- semantic_code_intelligence/plugins/__init__.py +255 -0
- semantic_code_intelligence/plugins/examples/__init__.py +1 -0
- semantic_code_intelligence/plugins/examples/code_quality.py +73 -0
- semantic_code_intelligence/plugins/examples/search_annotator.py +56 -0
- semantic_code_intelligence/scalability/__init__.py +205 -0
- semantic_code_intelligence/search/__init__.py +0 -0
- semantic_code_intelligence/search/formatter.py +123 -0
- semantic_code_intelligence/search/grep.py +361 -0
- semantic_code_intelligence/search/hybrid_search.py +170 -0
- semantic_code_intelligence/search/keyword_search.py +311 -0
- semantic_code_intelligence/search/section_expander.py +103 -0
- semantic_code_intelligence/services/__init__.py +0 -0
- semantic_code_intelligence/services/indexing_service.py +630 -0
- semantic_code_intelligence/services/search_service.py +269 -0
- semantic_code_intelligence/storage/__init__.py +0 -0
- semantic_code_intelligence/storage/chunk_hash_store.py +86 -0
- semantic_code_intelligence/storage/hash_store.py +66 -0
- semantic_code_intelligence/storage/index_manifest.py +85 -0
- semantic_code_intelligence/storage/index_stats.py +138 -0
- semantic_code_intelligence/storage/query_history.py +160 -0
- semantic_code_intelligence/storage/symbol_registry.py +209 -0
- semantic_code_intelligence/storage/vector_store.py +297 -0
- semantic_code_intelligence/tests/__init__.py +0 -0
- semantic_code_intelligence/tests/test_ai_features.py +351 -0
- semantic_code_intelligence/tests/test_chunker.py +119 -0
- semantic_code_intelligence/tests/test_cli.py +188 -0
- semantic_code_intelligence/tests/test_config.py +154 -0
- semantic_code_intelligence/tests/test_context.py +381 -0
- semantic_code_intelligence/tests/test_embeddings.py +73 -0
- semantic_code_intelligence/tests/test_endtoend.py +1142 -0
- semantic_code_intelligence/tests/test_enhanced_embeddings.py +92 -0
- semantic_code_intelligence/tests/test_hash_store.py +79 -0
- semantic_code_intelligence/tests/test_logging.py +55 -0
- semantic_code_intelligence/tests/test_new_cli.py +138 -0
- semantic_code_intelligence/tests/test_parser.py +495 -0
- semantic_code_intelligence/tests/test_phase10.py +355 -0
- semantic_code_intelligence/tests/test_phase11.py +593 -0
- semantic_code_intelligence/tests/test_phase12.py +375 -0
- semantic_code_intelligence/tests/test_phase13.py +663 -0
- semantic_code_intelligence/tests/test_phase14.py +568 -0
- semantic_code_intelligence/tests/test_phase15.py +814 -0
- semantic_code_intelligence/tests/test_phase16.py +792 -0
- semantic_code_intelligence/tests/test_phase17.py +815 -0
- semantic_code_intelligence/tests/test_phase18.py +934 -0
- semantic_code_intelligence/tests/test_phase19.py +986 -0
- semantic_code_intelligence/tests/test_phase20.py +2753 -0
- semantic_code_intelligence/tests/test_phase20b.py +2058 -0
- semantic_code_intelligence/tests/test_phase20c.py +962 -0
- semantic_code_intelligence/tests/test_phase21.py +428 -0
- semantic_code_intelligence/tests/test_phase22.py +799 -0
- semantic_code_intelligence/tests/test_phase23.py +783 -0
- semantic_code_intelligence/tests/test_phase24.py +715 -0
- semantic_code_intelligence/tests/test_phase25.py +496 -0
- semantic_code_intelligence/tests/test_phase26.py +251 -0
- semantic_code_intelligence/tests/test_phase27.py +531 -0
- semantic_code_intelligence/tests/test_phase8.py +592 -0
- semantic_code_intelligence/tests/test_phase9.py +643 -0
- semantic_code_intelligence/tests/test_plugins.py +293 -0
- semantic_code_intelligence/tests/test_priority_features.py +727 -0
- semantic_code_intelligence/tests/test_router.py +41 -0
- semantic_code_intelligence/tests/test_scalability.py +138 -0
- semantic_code_intelligence/tests/test_scanner.py +125 -0
- semantic_code_intelligence/tests/test_search.py +160 -0
- semantic_code_intelligence/tests/test_semantic_chunker.py +255 -0
- semantic_code_intelligence/tests/test_tools.py +182 -0
- semantic_code_intelligence/tests/test_vector_store.py +151 -0
- semantic_code_intelligence/tests/test_watcher.py +211 -0
- semantic_code_intelligence/tools/__init__.py +442 -0
- semantic_code_intelligence/tools/executor.py +232 -0
- semantic_code_intelligence/tools/protocol.py +200 -0
- semantic_code_intelligence/tui/__init__.py +454 -0
- semantic_code_intelligence/utils/__init__.py +0 -0
- semantic_code_intelligence/utils/logging.py +112 -0
- semantic_code_intelligence/version.py +3 -0
- semantic_code_intelligence/web/__init__.py +11 -0
- semantic_code_intelligence/web/api.py +289 -0
- semantic_code_intelligence/web/server.py +397 -0
- semantic_code_intelligence/web/ui.py +659 -0
- semantic_code_intelligence/web/visualize.py +226 -0
- semantic_code_intelligence/workspace/__init__.py +427 -0
|
@@ -0,0 +1,630 @@
|
|
|
1
|
+
"""Indexing service — orchestrates scanning, chunking, embedding, and storage.
|
|
2
|
+
|
|
3
|
+
Supports chunk-level incremental indexing: when a file changes, only the
|
|
4
|
+
individual chunks whose content actually differs are re-embedded, while
|
|
5
|
+
unchanged chunks keep their existing vectors (high cache-hit ratio).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import time
|
|
11
|
+
from collections import defaultdict
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
|
|
16
|
+
from semantic_code_intelligence.config.settings import AppConfig, load_config
|
|
17
|
+
from semantic_code_intelligence.embeddings.generator import (
|
|
18
|
+
generate_embeddings,
|
|
19
|
+
get_embedding_dimension,
|
|
20
|
+
)
|
|
21
|
+
from semantic_code_intelligence.indexing.chunker import CodeChunk, chunk_file, detect_language
|
|
22
|
+
from semantic_code_intelligence.indexing.scanner import ScannedFile, scan_repository
|
|
23
|
+
from semantic_code_intelligence.parsing.parser import Symbol, parse_file
|
|
24
|
+
from semantic_code_intelligence.storage.chunk_hash_store import ChunkHashStore, compute_chunk_hash
|
|
25
|
+
from semantic_code_intelligence.storage.hash_store import HashStore
|
|
26
|
+
from semantic_code_intelligence.storage.index_manifest import IndexManifest
|
|
27
|
+
from semantic_code_intelligence.storage.index_stats import IndexStats, LanguageCoverage
|
|
28
|
+
from semantic_code_intelligence.storage.symbol_registry import SymbolEntry, SymbolRegistry
|
|
29
|
+
from semantic_code_intelligence.storage.vector_store import ChunkMetadata, VectorStore
|
|
30
|
+
from semantic_code_intelligence.utils.logging import get_logger
|
|
31
|
+
|
|
32
|
+
logger = get_logger("services.indexing")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class IndexingResult:
    """Counters describing the outcome of a single indexing run."""

    def __init__(self) -> None:
        # File-level counters.
        self.files_scanned: int = 0
        self.files_indexed: int = 0
        self.files_skipped: int = 0
        # Chunk-level counters.
        self.chunks_created: int = 0
        self.chunks_reused: int = 0
        # Totals read back from the stores once the run has finished.
        self.total_vectors: int = 0
        self.symbols_extracted: int = 0

    def __repr__(self) -> str:
        # (label, value) pairs in the order they appear in the rendered text.
        labelled = (
            ("scanned", self.files_scanned),
            ("indexed", self.files_indexed),
            ("skipped", self.files_skipped),
            ("chunks", self.chunks_created),
            ("reused", self.chunks_reused),
            ("vectors", self.total_vectors),
            ("symbols", self.symbols_extracted),
        )
        rendered = ", ".join(f"{label}={value}" for label, value in labelled)
        return f"IndexingResult({rendered})"
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _extract_symbols(
    files_to_index: list[ScannedFile],
    deleted_paths: list[str],
    index_dir: Path,
    force: bool,
) -> tuple[SymbolRegistry, int]:
    """Extract symbols from indexed files and update the registry.

    Args:
        files_to_index: Files whose symbols should be (re-)extracted.
        deleted_paths: Relative paths whose symbols are dropped from the
            registry.
        index_dir: Directory the registry is loaded from and saved to.
        force: If True, start from an empty registry instead of loading
            the persisted one.

    Returns:
        The updated registry and the number of symbol entries added by
        this call.
    """
    registry = SymbolRegistry() if force else SymbolRegistry.load(index_dir)

    for deleted in deleted_paths:
        registry.remove_file(deleted)

    added = 0
    for sf in files_to_index:
        # Drop the file's previous entries first so re-indexing replaces
        # them rather than appending to them.
        registry.remove_file(sf.relative_path)
        try:
            symbols = parse_file(sf.path)
            # The language depends only on the file, not on the symbol.
            language = detect_language(str(sf.path))
            entries = [
                SymbolEntry(
                    name=sym.name,
                    kind=sym.kind,
                    file_path=sf.relative_path,
                    start_line=sym.start_line,
                    end_line=sym.end_line,
                    parent=sym.parent,
                    parameters=sym.parameters,
                    decorators=sym.decorators,
                    language=language,
                )
                for sym in symbols
            ]
            registry.add_many(entries)
        except Exception:
            # Best effort: one unparsable file must not abort the run, but
            # keep the traceback so the failure can be diagnosed.
            logger.debug(
                "Symbol extraction failed for %s",
                sf.relative_path,
                exc_info=True,
            )
            continue
        added += len(entries)

    registry.save(index_dir)
    return registry, added
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _compute_index_stats(
    all_chunks: list[CodeChunk],
    registry: SymbolRegistry,
    result: IndexingResult,
    config: "AppConfig",
    dimension: int,
    store_size: int,
    indexing_start: float,
    index_dir: Path,
) -> None:
    """Aggregate per-language metrics and persist index stats.

    Args:
        all_chunks: Chunks to aggregate; per-language file, chunk and line
            counts are derived from these only.
        registry: Symbol registry supplying per-language and total symbol
            counts.
        result: Indexing result supplying the indexed/skipped file counts.
        config: Application config; the embedding model name is read from it.
        dimension: Embedding dimension to record.
        store_size: Vector-store size, recorded as both the chunk total and
            the vector total.
        indexing_start: ``time.time()`` value taken when indexing began.
        index_dir: Directory the stats are saved to.
    """
    finished_at = time.time()

    files_by_language: dict[str, set[str]] = defaultdict(set)
    chunks_by_language: dict[str, int] = defaultdict(int)
    lines_by_language: dict[str, int] = defaultdict(int)
    character_total = 0
    for chunk in all_chunks:
        # Chunks without a language are grouped under "unknown".
        language = chunk.language or "unknown"
        files_by_language[language].add(chunk.file_path)
        chunks_by_language[language] += 1
        # Line ranges are inclusive, hence the +1.
        lines_by_language[language] += chunk.end_line - chunk.start_line + 1
        character_total += len(chunk.content)
    symbols_by_language: dict[str, int] = registry.language_summary()

    coverage = []
    for language, paths in files_by_language.items():
        coverage.append(
            LanguageCoverage(
                language=language,
                files=len(paths),
                chunks=chunks_by_language[language],
                symbols=symbols_by_language.get(language, 0),
                total_lines=lines_by_language[language],
            )
        )

    if all_chunks:
        average_chunk_size = round(character_total / len(all_chunks), 1)
    else:
        average_chunk_size = 0.0

    IndexStats(
        total_files=result.files_indexed + result.files_skipped,
        total_chunks=store_size,
        total_symbols=registry.size,
        total_vectors=store_size,
        last_indexed_at=finished_at,
        indexing_duration_seconds=round(finished_at - indexing_start, 3),
        language_coverage=coverage,
        avg_chunk_size=average_chunk_size,
        embedding_model=config.embedding.model_name,
        embedding_dimension=dimension,
    ).save(index_dir)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def run_indexing(
    project_root: Path,
    force: bool = False,
) -> IndexingResult:
    """Run the full indexing pipeline for a project.

    Uses **chunk-level incremental indexing**: when a file changes, each
    chunk is individually hashed and only chunks with new/changed content
    are re-embedded.  A chunk whose hash is unchanged reuses the vector
    already held by the vector store, provided an entry with the same file
    path, line range and content is found there; otherwise it is embedded
    again together with the other cache misses.

    Args:
        project_root: Root directory of the project.
        force: If True, re-index all files regardless of hash cache and
            build the vector store from scratch.

    Returns:
        IndexingResult with statistics.  ``chunks_reused`` counts the
        chunks whose chunk-level hash was unchanged.
    """
    project_root = project_root.resolve()
    config = load_config(project_root)
    index_dir = AppConfig.index_dir(project_root)
    index_dir.mkdir(parents=True, exist_ok=True)

    indexing_start = time.time()
    result = IndexingResult()

    # Step 1: Scan repository
    logger.info("Scanning repository: %s", project_root)
    scanned_files = scan_repository(project_root, config.index)
    result.files_scanned = len(scanned_files)
    logger.info("Found %d indexable files.", result.files_scanned)

    if not scanned_files:
        # NOTE(review): returning here also skips deleted-file detection,
        # so entries tracked by a previous run are left in place when the
        # scan finds nothing.
        return result

    # Step 2: Load hash stores for incremental indexing
    hash_store = HashStore.load(index_dir)
    chunk_hash_store = ChunkHashStore.load(index_dir)
    scanned_paths = {sf.relative_path for sf in scanned_files}

    files_to_index: list[ScannedFile]
    deleted_paths: list[str] = []
    if force:
        files_to_index = list(scanned_files)
    else:
        files_to_index = [
            sf
            for sf in scanned_files
            if hash_store.has_changed(sf.relative_path, sf.content_hash)
        ]
        result.files_skipped = len(scanned_files) - len(files_to_index)
        # Deleted files: tracked in hash_store but no longer on disk.
        # NOTE(review): this reads HashStore's private mapping; a public
        # accessor would be preferable if one exists.
        deleted_paths = [
            tracked
            for tracked in hash_store._hashes
            if tracked not in scanned_paths
        ]

    logger.info(
        "%d files to index (%d skipped, unchanged).",
        len(files_to_index),
        result.files_skipped,
    )

    # Step 3: Chunk all changed files
    all_chunks: list[CodeChunk] = []
    chunk_file_hashes: list[str] = []  # parallel to all_chunks: owning file's hash

    for sf in files_to_index:
        chunks = chunk_file(
            sf.path,
            chunk_size=config.embedding.chunk_size,
            chunk_overlap=config.embedding.chunk_overlap,
        )
        all_chunks.extend(chunks)
        chunk_file_hashes.extend([sf.content_hash] * len(chunks))
        result.files_indexed += 1

    result.chunks_created = len(all_chunks)
    logger.info("Created %d code chunks.", result.chunks_created)

    if not all_chunks:
        # Update hashes even if no chunks (e.g. empty files)
        for sf in files_to_index:
            hash_store.set(sf.relative_path, sf.content_hash)

        # Forget deleted files in the hash stores whether or not a vector
        # store exists on disk.
        for dp in deleted_paths:
            hash_store.remove(dp)
            chunk_hash_store.remove_by_file(str(project_root / dp))

        # Purge vectors of deleted files and of changed files that no
        # longer produce any chunk, so the store holds no stale content.
        stale_files = [str(sf.path) for sf in files_to_index]
        stale_files.extend(str(project_root / dp) for dp in deleted_paths)
        if stale_files:
            try:
                store = VectorStore.load(index_dir)
            except FileNotFoundError:
                pass  # nothing has been indexed yet, so nothing to purge
            else:
                for stale in stale_files:
                    store.remove_by_file(stale)
                store.save(index_dir)

        hash_store.save(index_dir)
        chunk_hash_store.save(index_dir)
        return result

    # Step 4: Chunk-level delta — separate new/changed chunks from reusable ones
    if force:
        changed_indices = list(range(len(all_chunks)))
    else:
        changed_indices = [
            i
            for i, chunk in enumerate(all_chunks)
            if chunk_hash_store.has_changed(
                ChunkHashStore.chunk_key(
                    chunk.file_path, chunk.start_line, chunk.end_line,
                ),
                compute_chunk_hash(chunk.content),
            )
        ]
    # all_chunks index -> row in new_embeddings
    embed_row = {chunk_idx: row for row, chunk_idx in enumerate(changed_indices)}

    result.chunks_reused = len(all_chunks) - len(changed_indices)
    logger.info(
        "Chunk-level delta: %d to embed, %d reused (cache hit %.0f%%).",
        len(changed_indices),
        result.chunks_reused,
        100 * result.chunks_reused / len(all_chunks),
    )

    # Step 5: Generate embeddings only for changed chunks
    new_embeddings: np.ndarray | None = None
    if changed_indices:
        texts = [all_chunks[i].content for i in changed_indices]
        logger.info("Generating embeddings for %d chunks...", len(texts))
        new_embeddings = generate_embeddings(
            texts,
            model_name=config.embedding.model_name,
            show_progress=True,
        )
        logger.info("Embeddings generated. Shape: %s", new_embeddings.shape)

    # Step 6: Load or create vector store and reconcile
    if new_embeddings is not None:
        dimension = new_embeddings.shape[1]
    else:
        dimension = get_embedding_dimension(config.embedding.model_name)

    # (file_path, start_line, end_line) -> (chunk content, vector).
    # The content is kept so a hit can be verified against the new chunk.
    # The file-level hash is deliberately not part of the key: it differs
    # for every file selected as changed, which would turn every lookup
    # into a miss.
    cached_vectors: dict[tuple[str, int, int], tuple[str, np.ndarray]] = {}
    if force:
        store = VectorStore(dimension)
    else:
        try:
            store = VectorStore.load(index_dir)
        except FileNotFoundError:
            store = VectorStore(dimension)

        # Extract existing vectors for unchanged chunks BEFORE removing
        if result.chunks_reused:
            for sf in files_to_index:
                for meta, vec in store.get_vectors_for_file(str(sf.path)):
                    location = (meta.file_path, meta.start_line, meta.end_line)
                    cached_vectors[location] = (meta.content, vec)

        # Remove stale vectors for changed files before adding updated ones
        for sf in files_to_index:
            store.remove_by_file(str(sf.path))

        # Remove vectors for deleted files
        for dp in deleted_paths:
            full = str(project_root / dp)
            store.remove_by_file(full)
            hash_store.remove(dp)
            chunk_hash_store.remove_by_file(full)

    # Step 7: Build final embeddings by combining cached + new vectors
    vectors: list[np.ndarray | None] = [None] * len(all_chunks)
    cache_misses: list[int] = []
    for i, chunk in enumerate(all_chunks):
        row = embed_row.get(i)
        if row is not None and new_embeddings is not None:
            vectors[i] = new_embeddings[row]
            continue
        hit = cached_vectors.get(
            (chunk.file_path, chunk.start_line, chunk.end_line)
        )
        if hit is not None and hit[0] == chunk.content:
            vectors[i] = hit[1]
        else:
            # Hash said "unchanged" but no matching vector is stored.
            cache_misses.append(i)

    if cache_misses:
        # Embed all misses in one batch instead of one call per chunk.
        miss_embeddings = generate_embeddings(
            [all_chunks[i].content for i in cache_misses],
            model_name=config.embedding.model_name,
            show_progress=False,
        )
        for i, vec in zip(cache_misses, miss_embeddings):
            vectors[i] = vec
        logger.info(
            "Re-embedded %d chunks (cache miss due to position shift).",
            len(cache_misses),
        )

    # all_chunks is non-empty here and every slot has been filled above.
    all_embeddings = np.vstack(
        [v.reshape(1, -1) for v in vectors]
    ).astype(np.float32)

    metadata_list = [
        ChunkMetadata(
            file_path=chunk.file_path,
            start_line=chunk.start_line,
            end_line=chunk.end_line,
            chunk_index=chunk.chunk_index,
            language=chunk.language,
            content=chunk.content,
            content_hash=chunk_file_hashes[i],
        )
        for i, chunk in enumerate(all_chunks)
    ]

    store.add(all_embeddings, metadata_list)
    store.save(index_dir)

    # Step 8: Update hash stores
    for sf in files_to_index:
        hash_store.set(sf.relative_path, sf.content_hash)
    # Update chunk-level hashes
    for chunk in all_chunks:
        c_key = ChunkHashStore.chunk_key(
            chunk.file_path, chunk.start_line, chunk.end_line,
        )
        chunk_hash_store.set(c_key, compute_chunk_hash(chunk.content))

    hash_store.save(index_dir)
    chunk_hash_store.save(index_dir)

    result.total_vectors = store.size

    # Step 9: Extract symbols and populate registry
    registry, result.symbols_extracted = _extract_symbols(
        files_to_index, deleted_paths, index_dir, force,
    )

    # Step 10: Update index manifest
    manifest = IndexManifest.load(index_dir) or IndexManifest()
    manifest.embedding_model = config.embedding.model_name
    manifest.embedding_dimension = dimension
    manifest.project_root = str(project_root)
    manifest.total_files = result.files_indexed + result.files_skipped
    manifest.total_chunks = store.size
    manifest.total_symbols = registry.size
    # NOTE(review): only languages of chunks produced in this run are
    # recorded; languages of skipped (unchanged) files are not included.
    # A missing language is treated like "unknown".
    manifest.languages = sorted({
        chunk.language
        for chunk in all_chunks
        if chunk.language and chunk.language != "unknown"
    })
    manifest.touch()
    manifest.save(index_dir)

    # Step 11: Compute and persist index stats
    _compute_index_stats(
        all_chunks, registry, result, config,
        dimension, store.size, indexing_start, index_dir,
    )

    logger.info("Indexing complete. %s", result)
    return result
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
# ── Per-file incremental indexing (Phase 27) ─────────────────────────
|
|
425
|
+
|
|
426
|
+
def run_incremental_indexing(
    project_root: Path,
    changed_files: list[str],
    deleted_files: list[str] | None = None,
) -> IndexingResult:
    """Re-index only specific changed/deleted files without a full repo scan.

    This is the key performance optimisation for the daemon: instead of
    scanning the entire repository on every file change, it processes only
    the files that the watcher detected as created/modified/deleted.

    Changed files are re-chunked, re-embedded, and their vectors replaced
    in the vector store. Deleted files have their vectors, hashes, and
    symbols removed.

    If no vector store exists on disk yet, the call falls back to
    ``run_indexing(project_root, force=False)``.

    Args:
        project_root: Root directory of the project.
        changed_files: Absolute paths of files that were created or modified.
            Paths listed more than once are processed a single time.
        deleted_files: Absolute paths of files that were deleted.

    Returns:
        IndexingResult with statistics for the incremental operation.
    """
    project_root = project_root.resolve()
    config = load_config(project_root)
    index_dir = AppConfig.index_dir(project_root)
    index_dir.mkdir(parents=True, exist_ok=True)
    deleted_files = deleted_files or []

    result = IndexingResult()

    # Load existing stores
    hash_store = HashStore.load(index_dir)
    chunk_hash_store = ChunkHashStore.load(index_dir)

    try:
        store = VectorStore.load(index_dir)
    except FileNotFoundError:
        # No existing index — fall back to full indexing
        logger.info("No existing index found; falling back to full indexing.")
        return run_indexing(project_root, force=False)

    # Step 1: Handle deleted files
    deleted_rel: list[str] = []
    for dp in deleted_files:
        p = Path(dp)
        if p.is_absolute():
            try:
                rel = str(p.relative_to(project_root))
            except ValueError:
                # Path lies outside the project root; keep it as given.
                rel = str(p)
        else:
            rel = dp
        store.remove_by_file(dp)
        hash_store.remove(rel)
        chunk_hash_store.remove_by_file(dp)
        deleted_rel.append(rel)

    # Step 2: Process changed files
    scanned_files: list[ScannedFile] = []
    seen_paths: set[Path] = set()
    for fp in changed_files:
        p = Path(fp)
        # A path reported twice would otherwise be chunked and embedded
        # twice, leaving duplicate vectors in the store.
        if p in seen_paths:
            continue
        seen_paths.add(p)
        if not p.is_file():
            continue
        try:
            rel = str(p.relative_to(project_root))
        except ValueError:
            rel = str(p)
        content_hash = _safe_compute_hash(p)
        if content_hash is None:
            continue
        try:
            size_bytes = p.stat().st_size
        except OSError:
            # The file vanished (or became unreadable) after the is_file()
            # check; skip it instead of aborting the whole pass.
            continue
        scanned_files.append(ScannedFile(
            path=p,
            relative_path=rel,
            extension=p.suffix,
            size_bytes=size_bytes,
            content_hash=content_hash,
        ))
    result.files_scanned = len(scanned_files)

    # Step 3: Filter to files that actually changed (hash check)
    files_to_index: list[ScannedFile] = []
    for sf in scanned_files:
        if hash_store.has_changed(sf.relative_path, sf.content_hash):
            files_to_index.append(sf)
        else:
            result.files_skipped += 1

    if not files_to_index and not deleted_files:
        logger.info("Incremental: nothing to update.")
        return result

    # Step 4: Chunk changed files
    all_chunks: list[CodeChunk] = []
    # Parallel to all_chunks: content hash of the file each chunk came from.
    chunk_file_hashes: list[str] = []

    for sf in files_to_index:
        # Remove old vectors for this file before re-adding
        store.remove_by_file(str(sf.path))
        chunk_hash_store.remove_by_file(str(sf.path))

        chunks = chunk_file(
            sf.path,
            chunk_size=config.embedding.chunk_size,
            chunk_overlap=config.embedding.chunk_overlap,
        )
        for c in chunks:
            all_chunks.append(c)
            chunk_file_hashes.append(sf.content_hash)
        result.files_indexed += 1

    result.chunks_created = len(all_chunks)

    # Step 5: Embed and add to store
    if all_chunks:
        texts = [chunk.content for chunk in all_chunks]
        logger.info("Incremental: embedding %d chunks from %d files...",
                    len(texts), len(files_to_index))
        embeddings = generate_embeddings(
            texts,
            model_name=config.embedding.model_name,
            show_progress=False,
        )

        metadata_list = [
            ChunkMetadata(
                file_path=chunk.file_path,
                start_line=chunk.start_line,
                end_line=chunk.end_line,
                chunk_index=chunk.chunk_index,
                language=chunk.language,
                content=chunk.content,
                content_hash=file_hash,
            )
            for chunk, file_hash in zip(all_chunks, chunk_file_hashes)
        ]

        store.add(embeddings, metadata_list)

    # Step 6: Persist stores
    store.save(index_dir)

    for sf in files_to_index:
        hash_store.set(sf.relative_path, sf.content_hash)
    for chunk in all_chunks:
        c_key = ChunkHashStore.chunk_key(
            chunk.file_path, chunk.start_line, chunk.end_line,
        )
        chunk_hash_store.set(c_key, compute_chunk_hash(chunk.content))

    hash_store.save(index_dir)
    chunk_hash_store.save(index_dir)
    result.total_vectors = store.size

    # Step 7: Update symbol registry for changed files only
    registry = SymbolRegistry.load(index_dir)
    sym_count = 0
    for dp in deleted_rel:
        registry.remove_file(dp)
    for sf in files_to_index:
        # Stale symbols are dropped first; if extraction fails below, the
        # file simply has no symbols until the next successful pass.
        registry.remove_file(sf.relative_path)
        try:
            symbols = parse_file(sf.path)
            entries = [
                SymbolEntry(
                    name=sym.name,
                    kind=sym.kind,
                    file_path=sf.relative_path,
                    start_line=sym.start_line,
                    end_line=sym.end_line,
                    parent=sym.parent,
                    parameters=sym.parameters,
                    decorators=sym.decorators,
                    language=detect_language(str(sf.path)),
                )
                for sym in symbols
            ]
            registry.add_many(entries)
            sym_count += len(entries)
        except Exception:
            # Best-effort: one unparsable file must not abort the pass, but
            # keep the traceback so the failure can be diagnosed.
            logger.debug(
                "Symbol extraction failed for %s", sf.relative_path,
                exc_info=True,
            )
    registry.save(index_dir)
    result.symbols_extracted = sym_count

    # Step 8: Update index manifest
    manifest = IndexManifest.load(index_dir) or IndexManifest()
    manifest.total_chunks = store.size
    manifest.total_symbols = registry.size
    manifest.touch()
    manifest.save(index_dir)

    logger.info("Incremental indexing complete. %s", result)
    return result
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
def _safe_compute_hash(path: Path) -> str | None:
    """Return the content hash of *path*, or ``None`` if hashing fails.

    Any ``OSError`` raised while hashing (which includes
    ``PermissionError``) is swallowed and reported as ``None``.
    """
    digest: str | None = None
    try:
        from semantic_code_intelligence.indexing.scanner import (
            compute_file_hash as _hash_file,
        )
        digest = _hash_file(path)
    except OSError:
        # PermissionError is a subclass of OSError, so one clause covers both.
        pass
    return digest
|