codexa 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189) hide show
  1. codexa-0.4.0.dist-info/METADATA +650 -0
  2. codexa-0.4.0.dist-info/RECORD +189 -0
  3. codexa-0.4.0.dist-info/WHEEL +5 -0
  4. codexa-0.4.0.dist-info/entry_points.txt +2 -0
  5. codexa-0.4.0.dist-info/licenses/LICENSE +21 -0
  6. codexa-0.4.0.dist-info/top_level.txt +1 -0
  7. semantic_code_intelligence/__init__.py +5 -0
  8. semantic_code_intelligence/analysis/__init__.py +21 -0
  9. semantic_code_intelligence/analysis/ai_features.py +351 -0
  10. semantic_code_intelligence/bridge/__init__.py +28 -0
  11. semantic_code_intelligence/bridge/context_provider.py +245 -0
  12. semantic_code_intelligence/bridge/protocol.py +167 -0
  13. semantic_code_intelligence/bridge/server.py +348 -0
  14. semantic_code_intelligence/bridge/vscode.py +271 -0
  15. semantic_code_intelligence/ci/__init__.py +13 -0
  16. semantic_code_intelligence/ci/hooks.py +98 -0
  17. semantic_code_intelligence/ci/hotspots.py +272 -0
  18. semantic_code_intelligence/ci/impact.py +246 -0
  19. semantic_code_intelligence/ci/metrics.py +591 -0
  20. semantic_code_intelligence/ci/pr.py +412 -0
  21. semantic_code_intelligence/ci/quality.py +557 -0
  22. semantic_code_intelligence/ci/templates.py +164 -0
  23. semantic_code_intelligence/ci/trace.py +224 -0
  24. semantic_code_intelligence/cli/__init__.py +0 -0
  25. semantic_code_intelligence/cli/commands/__init__.py +0 -0
  26. semantic_code_intelligence/cli/commands/ask_cmd.py +153 -0
  27. semantic_code_intelligence/cli/commands/benchmark_cmd.py +303 -0
  28. semantic_code_intelligence/cli/commands/chat_cmd.py +252 -0
  29. semantic_code_intelligence/cli/commands/ci_gen_cmd.py +74 -0
  30. semantic_code_intelligence/cli/commands/context_cmd.py +120 -0
  31. semantic_code_intelligence/cli/commands/cross_refactor_cmd.py +113 -0
  32. semantic_code_intelligence/cli/commands/deps_cmd.py +91 -0
  33. semantic_code_intelligence/cli/commands/docs_cmd.py +101 -0
  34. semantic_code_intelligence/cli/commands/doctor_cmd.py +147 -0
  35. semantic_code_intelligence/cli/commands/evolve_cmd.py +171 -0
  36. semantic_code_intelligence/cli/commands/explain_cmd.py +112 -0
  37. semantic_code_intelligence/cli/commands/gate_cmd.py +135 -0
  38. semantic_code_intelligence/cli/commands/grep_cmd.py +234 -0
  39. semantic_code_intelligence/cli/commands/hotspots_cmd.py +119 -0
  40. semantic_code_intelligence/cli/commands/impact_cmd.py +131 -0
  41. semantic_code_intelligence/cli/commands/index_cmd.py +138 -0
  42. semantic_code_intelligence/cli/commands/init_cmd.py +152 -0
  43. semantic_code_intelligence/cli/commands/investigate_cmd.py +163 -0
  44. semantic_code_intelligence/cli/commands/languages_cmd.py +101 -0
  45. semantic_code_intelligence/cli/commands/lsp_cmd.py +49 -0
  46. semantic_code_intelligence/cli/commands/mcp_cmd.py +50 -0
  47. semantic_code_intelligence/cli/commands/metrics_cmd.py +264 -0
  48. semantic_code_intelligence/cli/commands/models_cmd.py +157 -0
  49. semantic_code_intelligence/cli/commands/plugin_cmd.py +275 -0
  50. semantic_code_intelligence/cli/commands/pr_summary_cmd.py +178 -0
  51. semantic_code_intelligence/cli/commands/quality_cmd.py +208 -0
  52. semantic_code_intelligence/cli/commands/refactor_cmd.py +103 -0
  53. semantic_code_intelligence/cli/commands/review_cmd.py +88 -0
  54. semantic_code_intelligence/cli/commands/search_cmd.py +236 -0
  55. semantic_code_intelligence/cli/commands/serve_cmd.py +117 -0
  56. semantic_code_intelligence/cli/commands/suggest_cmd.py +100 -0
  57. semantic_code_intelligence/cli/commands/summary_cmd.py +78 -0
  58. semantic_code_intelligence/cli/commands/tool_cmd.py +282 -0
  59. semantic_code_intelligence/cli/commands/trace_cmd.py +123 -0
  60. semantic_code_intelligence/cli/commands/tui_cmd.py +58 -0
  61. semantic_code_intelligence/cli/commands/viz_cmd.py +127 -0
  62. semantic_code_intelligence/cli/commands/watch_cmd.py +72 -0
  63. semantic_code_intelligence/cli/commands/web_cmd.py +61 -0
  64. semantic_code_intelligence/cli/commands/workspace_cmd.py +250 -0
  65. semantic_code_intelligence/cli/main.py +65 -0
  66. semantic_code_intelligence/cli/router.py +92 -0
  67. semantic_code_intelligence/config/__init__.py +0 -0
  68. semantic_code_intelligence/config/settings.py +260 -0
  69. semantic_code_intelligence/context/__init__.py +19 -0
  70. semantic_code_intelligence/context/engine.py +429 -0
  71. semantic_code_intelligence/context/memory.py +253 -0
  72. semantic_code_intelligence/daemon/__init__.py +1 -0
  73. semantic_code_intelligence/daemon/watcher.py +515 -0
  74. semantic_code_intelligence/docs/__init__.py +1080 -0
  75. semantic_code_intelligence/embeddings/__init__.py +0 -0
  76. semantic_code_intelligence/embeddings/enhanced.py +131 -0
  77. semantic_code_intelligence/embeddings/generator.py +149 -0
  78. semantic_code_intelligence/embeddings/model_registry.py +100 -0
  79. semantic_code_intelligence/evolution/__init__.py +1 -0
  80. semantic_code_intelligence/evolution/budget_guard.py +111 -0
  81. semantic_code_intelligence/evolution/commit_manager.py +88 -0
  82. semantic_code_intelligence/evolution/context_builder.py +131 -0
  83. semantic_code_intelligence/evolution/engine.py +249 -0
  84. semantic_code_intelligence/evolution/patch_generator.py +229 -0
  85. semantic_code_intelligence/evolution/task_selector.py +214 -0
  86. semantic_code_intelligence/evolution/test_runner.py +111 -0
  87. semantic_code_intelligence/indexing/__init__.py +0 -0
  88. semantic_code_intelligence/indexing/chunker.py +174 -0
  89. semantic_code_intelligence/indexing/parallel.py +86 -0
  90. semantic_code_intelligence/indexing/scanner.py +146 -0
  91. semantic_code_intelligence/indexing/semantic_chunker.py +337 -0
  92. semantic_code_intelligence/llm/__init__.py +62 -0
  93. semantic_code_intelligence/llm/cache.py +219 -0
  94. semantic_code_intelligence/llm/cached_provider.py +145 -0
  95. semantic_code_intelligence/llm/conversation.py +190 -0
  96. semantic_code_intelligence/llm/cross_refactor.py +272 -0
  97. semantic_code_intelligence/llm/investigation.py +274 -0
  98. semantic_code_intelligence/llm/mock_provider.py +77 -0
  99. semantic_code_intelligence/llm/ollama_provider.py +122 -0
  100. semantic_code_intelligence/llm/openai_provider.py +100 -0
  101. semantic_code_intelligence/llm/provider.py +92 -0
  102. semantic_code_intelligence/llm/rate_limiter.py +164 -0
  103. semantic_code_intelligence/llm/reasoning.py +438 -0
  104. semantic_code_intelligence/llm/safety.py +110 -0
  105. semantic_code_intelligence/llm/streaming.py +251 -0
  106. semantic_code_intelligence/lsp/__init__.py +609 -0
  107. semantic_code_intelligence/mcp/__init__.py +393 -0
  108. semantic_code_intelligence/parsing/__init__.py +19 -0
  109. semantic_code_intelligence/parsing/parser.py +375 -0
  110. semantic_code_intelligence/plugins/__init__.py +255 -0
  111. semantic_code_intelligence/plugins/examples/__init__.py +1 -0
  112. semantic_code_intelligence/plugins/examples/code_quality.py +73 -0
  113. semantic_code_intelligence/plugins/examples/search_annotator.py +56 -0
  114. semantic_code_intelligence/scalability/__init__.py +205 -0
  115. semantic_code_intelligence/search/__init__.py +0 -0
  116. semantic_code_intelligence/search/formatter.py +123 -0
  117. semantic_code_intelligence/search/grep.py +361 -0
  118. semantic_code_intelligence/search/hybrid_search.py +170 -0
  119. semantic_code_intelligence/search/keyword_search.py +311 -0
  120. semantic_code_intelligence/search/section_expander.py +103 -0
  121. semantic_code_intelligence/services/__init__.py +0 -0
  122. semantic_code_intelligence/services/indexing_service.py +630 -0
  123. semantic_code_intelligence/services/search_service.py +269 -0
  124. semantic_code_intelligence/storage/__init__.py +0 -0
  125. semantic_code_intelligence/storage/chunk_hash_store.py +86 -0
  126. semantic_code_intelligence/storage/hash_store.py +66 -0
  127. semantic_code_intelligence/storage/index_manifest.py +85 -0
  128. semantic_code_intelligence/storage/index_stats.py +138 -0
  129. semantic_code_intelligence/storage/query_history.py +160 -0
  130. semantic_code_intelligence/storage/symbol_registry.py +209 -0
  131. semantic_code_intelligence/storage/vector_store.py +297 -0
  132. semantic_code_intelligence/tests/__init__.py +0 -0
  133. semantic_code_intelligence/tests/test_ai_features.py +351 -0
  134. semantic_code_intelligence/tests/test_chunker.py +119 -0
  135. semantic_code_intelligence/tests/test_cli.py +188 -0
  136. semantic_code_intelligence/tests/test_config.py +154 -0
  137. semantic_code_intelligence/tests/test_context.py +381 -0
  138. semantic_code_intelligence/tests/test_embeddings.py +73 -0
  139. semantic_code_intelligence/tests/test_endtoend.py +1142 -0
  140. semantic_code_intelligence/tests/test_enhanced_embeddings.py +92 -0
  141. semantic_code_intelligence/tests/test_hash_store.py +79 -0
  142. semantic_code_intelligence/tests/test_logging.py +55 -0
  143. semantic_code_intelligence/tests/test_new_cli.py +138 -0
  144. semantic_code_intelligence/tests/test_parser.py +495 -0
  145. semantic_code_intelligence/tests/test_phase10.py +355 -0
  146. semantic_code_intelligence/tests/test_phase11.py +593 -0
  147. semantic_code_intelligence/tests/test_phase12.py +375 -0
  148. semantic_code_intelligence/tests/test_phase13.py +663 -0
  149. semantic_code_intelligence/tests/test_phase14.py +568 -0
  150. semantic_code_intelligence/tests/test_phase15.py +814 -0
  151. semantic_code_intelligence/tests/test_phase16.py +792 -0
  152. semantic_code_intelligence/tests/test_phase17.py +815 -0
  153. semantic_code_intelligence/tests/test_phase18.py +934 -0
  154. semantic_code_intelligence/tests/test_phase19.py +986 -0
  155. semantic_code_intelligence/tests/test_phase20.py +2753 -0
  156. semantic_code_intelligence/tests/test_phase20b.py +2058 -0
  157. semantic_code_intelligence/tests/test_phase20c.py +962 -0
  158. semantic_code_intelligence/tests/test_phase21.py +428 -0
  159. semantic_code_intelligence/tests/test_phase22.py +799 -0
  160. semantic_code_intelligence/tests/test_phase23.py +783 -0
  161. semantic_code_intelligence/tests/test_phase24.py +715 -0
  162. semantic_code_intelligence/tests/test_phase25.py +496 -0
  163. semantic_code_intelligence/tests/test_phase26.py +251 -0
  164. semantic_code_intelligence/tests/test_phase27.py +531 -0
  165. semantic_code_intelligence/tests/test_phase8.py +592 -0
  166. semantic_code_intelligence/tests/test_phase9.py +643 -0
  167. semantic_code_intelligence/tests/test_plugins.py +293 -0
  168. semantic_code_intelligence/tests/test_priority_features.py +727 -0
  169. semantic_code_intelligence/tests/test_router.py +41 -0
  170. semantic_code_intelligence/tests/test_scalability.py +138 -0
  171. semantic_code_intelligence/tests/test_scanner.py +125 -0
  172. semantic_code_intelligence/tests/test_search.py +160 -0
  173. semantic_code_intelligence/tests/test_semantic_chunker.py +255 -0
  174. semantic_code_intelligence/tests/test_tools.py +182 -0
  175. semantic_code_intelligence/tests/test_vector_store.py +151 -0
  176. semantic_code_intelligence/tests/test_watcher.py +211 -0
  177. semantic_code_intelligence/tools/__init__.py +442 -0
  178. semantic_code_intelligence/tools/executor.py +232 -0
  179. semantic_code_intelligence/tools/protocol.py +200 -0
  180. semantic_code_intelligence/tui/__init__.py +454 -0
  181. semantic_code_intelligence/utils/__init__.py +0 -0
  182. semantic_code_intelligence/utils/logging.py +112 -0
  183. semantic_code_intelligence/version.py +3 -0
  184. semantic_code_intelligence/web/__init__.py +11 -0
  185. semantic_code_intelligence/web/api.py +289 -0
  186. semantic_code_intelligence/web/server.py +397 -0
  187. semantic_code_intelligence/web/ui.py +659 -0
  188. semantic_code_intelligence/web/visualize.py +226 -0
  189. semantic_code_intelligence/workspace/__init__.py +427 -0
@@ -0,0 +1,630 @@
1
+ """Indexing service — orchestrates scanning, chunking, embedding, and storage.
2
+
3
+ Supports chunk-level incremental indexing: when a file changes, only the
4
+ individual chunks whose content actually differs are re-embedded, while
5
+ unchanged chunks keep their existing vectors (high cache-hit ratio).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import time
11
+ from collections import defaultdict
12
+ from pathlib import Path
13
+
14
+ import numpy as np
15
+
16
+ from semantic_code_intelligence.config.settings import AppConfig, load_config
17
+ from semantic_code_intelligence.embeddings.generator import (
18
+ generate_embeddings,
19
+ get_embedding_dimension,
20
+ )
21
+ from semantic_code_intelligence.indexing.chunker import CodeChunk, chunk_file, detect_language
22
+ from semantic_code_intelligence.indexing.scanner import ScannedFile, scan_repository
23
+ from semantic_code_intelligence.parsing.parser import Symbol, parse_file
24
+ from semantic_code_intelligence.storage.chunk_hash_store import ChunkHashStore, compute_chunk_hash
25
+ from semantic_code_intelligence.storage.hash_store import HashStore
26
+ from semantic_code_intelligence.storage.index_manifest import IndexManifest
27
+ from semantic_code_intelligence.storage.index_stats import IndexStats, LanguageCoverage
28
+ from semantic_code_intelligence.storage.symbol_registry import SymbolEntry, SymbolRegistry
29
+ from semantic_code_intelligence.storage.vector_store import ChunkMetadata, VectorStore
30
+ from semantic_code_intelligence.utils.logging import get_logger
31
+
32
+ logger = get_logger("services.indexing")
33
+
34
+
35
class IndexingResult:
    """Counters describing what a single indexing run did.

    Every counter starts at zero; the indexing functions in this module
    fill them in as the run progresses.
    """

    def __init__(self) -> None:
        # File-level counters.
        self.files_scanned: int = 0
        self.files_indexed: int = 0
        self.files_skipped: int = 0
        # Chunk-level counters.
        self.chunks_created: int = 0
        self.chunks_reused: int = 0
        # Totals read back from the stores once the run is finished.
        self.total_vectors: int = 0
        self.symbols_extracted: int = 0

    def __repr__(self) -> str:
        # (label, value) pairs in the order they appear in the rendered text.
        labelled = (
            ("scanned", self.files_scanned),
            ("indexed", self.files_indexed),
            ("skipped", self.files_skipped),
            ("chunks", self.chunks_created),
            ("reused", self.chunks_reused),
            ("vectors", self.total_vectors),
            ("symbols", self.symbols_extracted),
        )
        rendered = ", ".join(f"{label}={value}" for label, value in labelled)
        return f"IndexingResult({rendered})"
55
+
56
+
57
def _extract_symbols(
    files_to_index: list[ScannedFile],
    deleted_paths: list[str],
    index_dir: Path,
    force: bool,
) -> tuple[SymbolRegistry, int]:
    """Refresh the symbol registry for the given files and persist it.

    Args:
        files_to_index: Files whose symbols are re-parsed. Entries already
            registered under each file's ``relative_path`` are dropped first.
        deleted_paths: Registry file paths whose symbols are removed.
        index_dir: Directory the registry is loaded from and saved to.
        force: When True, start from an empty registry instead of loading
            the persisted one.

    Returns:
        The updated registry and the number of symbol entries added.
    """
    registry = SymbolRegistry() if force else SymbolRegistry.load(index_dir)
    count = 0
    for dp in deleted_paths:
        registry.remove_file(dp)
    for sf in files_to_index:
        # Drop stale entries first so a failed parse does not leave
        # outdated symbols behind for this file.
        registry.remove_file(sf.relative_path)
        try:
            symbols = parse_file(sf.path)
            # The language depends only on the file, so resolve it once
            # rather than once per symbol.
            language = detect_language(str(sf.path))
            entries = [
                SymbolEntry(
                    name=sym.name,
                    kind=sym.kind,
                    file_path=sf.relative_path,
                    start_line=sym.start_line,
                    end_line=sym.end_line,
                    parent=sym.parent,
                    parameters=sym.parameters,
                    decorators=sym.decorators,
                    language=language,
                )
                for sym in symbols
            ]
            registry.add_many(entries)
            count += len(entries)
        except Exception:
            # Symbol extraction is best-effort: one unparsable file must not
            # abort the run. Keep the traceback so the failure is diagnosable.
            logger.debug(
                "Symbol extraction failed for %s", sf.relative_path, exc_info=True,
            )
    registry.save(index_dir)
    return registry, count
92
+
93
+
94
def _compute_index_stats(
    all_chunks: list[CodeChunk],
    registry: SymbolRegistry,
    result: IndexingResult,
    config: "AppConfig",
    dimension: int,
    store_size: int,
    indexing_start: float,
    index_dir: Path,
) -> None:
    """Build per-language coverage figures and save an ``IndexStats`` record.

    Args:
        all_chunks: Chunks produced by the current run; the per-language
            figures and the average chunk size are derived from these only.
        registry: Symbol registry supplying per-language symbol counts and
            the total symbol count.
        result: Counters of the current run (indexed + skipped files give
            the reported file total).
        config: Application config; the embedding model name is read from it.
        dimension: Embedding dimension recorded in the stats.
        store_size: Vector-store size, recorded as both chunk and vector total.
        indexing_start: ``time.time()`` value taken when the run started.
        index_dir: Directory the stats are saved to.
    """
    finished_at = time.time()

    # Single pass over the chunks, grouping by language. Chunks without a
    # language are filed under "unknown".
    files_by_lang: dict[str, set[str]] = {}
    chunk_counts: dict[str, int] = {}
    line_counts: dict[str, int] = {}
    for piece in all_chunks:
        key = piece.language or "unknown"
        files_by_lang.setdefault(key, set()).add(piece.file_path)
        chunk_counts[key] = chunk_counts.get(key, 0) + 1
        span = piece.end_line - piece.start_line + 1
        line_counts[key] = line_counts.get(key, 0) + span
    symbol_counts: dict[str, int] = registry.language_summary()

    coverage = []
    for key, paths in files_by_lang.items():
        coverage.append(
            LanguageCoverage(
                language=key,
                files=len(paths),
                chunks=chunk_counts.get(key, 0),
                symbols=symbol_counts.get(key, 0),
                total_lines=line_counts.get(key, 0),
            )
        )

    char_total = sum(len(piece.content) for piece in all_chunks)
    if all_chunks:
        mean_chunk_size = round(char_total / len(all_chunks), 1)
    else:
        mean_chunk_size = 0.0

    file_total = result.files_indexed + result.files_skipped
    elapsed = round(finished_at - indexing_start, 3)
    stats = IndexStats(
        total_files=file_total,
        total_chunks=store_size,
        total_symbols=registry.size,
        total_vectors=store_size,
        last_indexed_at=finished_at,
        indexing_duration_seconds=elapsed,
        language_coverage=coverage,
        avg_chunk_size=mean_chunk_size,
        embedding_model=config.embedding.model_name,
        embedding_dimension=dimension,
    )
    stats.save(index_dir)
142
+
143
+
144
def _chunk_cache_key(file_path: str, start_line: int, end_line: int, content: str) -> str:
    """Build the key that matches a chunk to a previously stored vector."""
    # Keyed on the chunk's own content hash. The owning file's hash cannot be
    # used: a file is only re-processed because that hash changed.
    return f"{file_path}:{start_line}:{end_line}:{compute_chunk_hash(content)}"


def run_indexing(
    project_root: Path,
    force: bool = False,
) -> IndexingResult:
    """Run the full indexing pipeline for a project.

    Uses **chunk-level incremental indexing**: when a file changes, each
    chunk is individually hashed and only chunks with new/changed content
    are re-embedded. Unchanged chunks keep their existing vectors.

    Args:
        project_root: Root directory of the project.
        force: If True, re-index all files regardless of hash cache and
            start from an empty vector store and symbol registry.

    Returns:
        IndexingResult with statistics.
    """
    project_root = project_root.resolve()
    config = load_config(project_root)
    index_dir = AppConfig.index_dir(project_root)
    index_dir.mkdir(parents=True, exist_ok=True)

    indexing_start = time.time()
    result = IndexingResult()

    # Step 1: Scan repository
    logger.info("Scanning repository: %s", project_root)
    scanned_files = scan_repository(project_root, config.index)
    result.files_scanned = len(scanned_files)
    logger.info("Found %d indexable files.", result.files_scanned)

    if not scanned_files:
        return result

    # Step 2: Load hash stores for incremental indexing
    hash_store = HashStore.load(index_dir)
    chunk_hash_store = ChunkHashStore.load(index_dir)
    files_to_index: list[ScannedFile] = []
    scanned_paths = {sf.relative_path for sf in scanned_files}

    if force:
        files_to_index = scanned_files
    else:
        for sf in scanned_files:
            if hash_store.has_changed(sf.relative_path, sf.content_hash):
                files_to_index.append(sf)
            else:
                result.files_skipped += 1

    # Detect deleted files: tracked in hash_store but no longer on disk.
    # NOTE(review): this reads HashStore's private ``_hashes`` mapping; a
    # public accessor would be preferable if HashStore offers one.
    deleted_paths: list[str] = []
    if not force:
        deleted_paths = [
            tracked_path
            for tracked_path in list(hash_store._hashes.keys())
            if tracked_path not in scanned_paths
        ]

    logger.info(
        "%d files to index (%d skipped, unchanged).",
        len(files_to_index),
        result.files_skipped,
    )

    # Step 3: Chunk all changed files
    all_chunks: list[CodeChunk] = []
    chunk_file_hashes: list[str] = []  # parallel array: hash for each chunk's file

    for sf in files_to_index:
        chunks = chunk_file(
            sf.path,
            chunk_size=config.embedding.chunk_size,
            chunk_overlap=config.embedding.chunk_overlap,
        )
        for c in chunks:
            all_chunks.append(c)
            chunk_file_hashes.append(sf.content_hash)
        result.files_indexed += 1

    result.chunks_created = len(all_chunks)
    logger.info("Created %d code chunks.", result.chunks_created)

    if not all_chunks:
        # Update hashes even if no chunks (e.g. empty files)
        for sf in files_to_index:
            hash_store.set(sf.relative_path, sf.content_hash)

        # Still clean up deleted files. The hash stores are cleaned even when
        # no vector store exists yet, so deleted paths do not linger in them.
        if deleted_paths:
            try:
                existing_store = VectorStore.load(index_dir)
            except FileNotFoundError:
                existing_store = None
            for dp in deleted_paths:
                full = str(project_root / dp)
                if existing_store is not None:
                    existing_store.remove_by_file(full)
                hash_store.remove(dp)
                chunk_hash_store.remove_by_file(full)
            if existing_store is not None:
                existing_store.save(index_dir)

        hash_store.save(index_dir)
        chunk_hash_store.save(index_dir)
        return result

    # Step 4: Chunk-level delta — separate new/changed chunks from reusable ones
    chunks_to_embed: list[CodeChunk] = []
    reused_indices: list[int] = []  # indices into all_chunks that are unchanged

    if force:
        chunks_to_embed = all_chunks
    else:
        for i, chunk in enumerate(all_chunks):
            c_hash = compute_chunk_hash(chunk.content)
            c_key = ChunkHashStore.chunk_key(
                chunk.file_path, chunk.start_line, chunk.end_line,
            )
            if chunk_hash_store.has_changed(c_key, c_hash):
                chunks_to_embed.append(chunk)
            else:
                reused_indices.append(i)

    # Built once: membership is tested for every chunk below.
    reused_set = set(reused_indices)

    result.chunks_reused = len(reused_indices)
    logger.info(
        "Chunk-level delta: %d to embed, %d reused (cache hit %.0f%%).",
        len(chunks_to_embed),
        result.chunks_reused,
        100 * result.chunks_reused / len(all_chunks) if all_chunks else 0,
    )

    # Step 5: Generate embeddings only for changed chunks
    new_embeddings: np.ndarray | None = None
    if chunks_to_embed:
        texts = [chunk.content for chunk in chunks_to_embed]
        logger.info("Generating embeddings for %d chunks...", len(texts))
        new_embeddings = generate_embeddings(
            texts,
            model_name=config.embedding.model_name,
            show_progress=True,
        )
        logger.info("Embeddings generated. Shape: %s", new_embeddings.shape)

    # Step 6: Load or create vector store and reconcile
    if new_embeddings is not None:
        dimension = new_embeddings.shape[1]
    else:
        dimension = get_embedding_dimension(config.embedding.model_name)

    cached_vectors: dict[str, np.ndarray] = {}
    if force:
        store = VectorStore(dimension)
    else:
        try:
            store = VectorStore.load(index_dir)
        except FileNotFoundError:
            store = VectorStore(dimension)

        # Extract existing vectors for unchanged chunks BEFORE removing.
        # Assumes the stored metadata carries the chunk text in ``content``,
        # as it does for every ChunkMetadata built in this module.
        for sf in files_to_index:
            for meta, vec in store.get_vectors_for_file(str(sf.path)):
                cache_key = _chunk_cache_key(
                    meta.file_path, meta.start_line, meta.end_line, meta.content,
                )
                cached_vectors[cache_key] = vec

        # Remove stale vectors for changed files before adding updated ones
        for sf in files_to_index:
            store.remove_by_file(str(sf.path))

    # Remove vectors for deleted files (deleted_paths is empty when force=True)
    for dp in deleted_paths:
        full = str(project_root / dp)
        store.remove_by_file(full)
        hash_store.remove(dp)
        chunk_hash_store.remove_by_file(full)

    # Step 7: Build final embeddings by combining cached + new vectors.
    # Rows of new_embeddings follow the order of the non-reused chunks.
    embed_rows: dict[int, int] = {}  # all_chunks idx -> new_embeddings row
    for i in range(len(all_chunks)):
        if i not in reused_set:
            embed_rows[i] = len(embed_rows)

    vectors: list[np.ndarray | None] = [None] * len(all_chunks)
    miss_indices: list[int] = []  # chunks expected in the cache but not found
    for i, chunk in enumerate(all_chunks):
        if i in reused_set:
            cached_vec = cached_vectors.get(
                _chunk_cache_key(
                    chunk.file_path, chunk.start_line, chunk.end_line, chunk.content,
                )
            )
            if cached_vec is None:
                miss_indices.append(i)
            else:
                vectors[i] = cached_vec
        elif new_embeddings is not None:
            vectors[i] = new_embeddings[embed_rows[i]]
        else:
            miss_indices.append(i)

    if miss_indices:
        # Embed every cache miss in one batch rather than one call per chunk.
        miss_vectors = generate_embeddings(
            [all_chunks[i].content for i in miss_indices],
            model_name=config.embedding.model_name,
            show_progress=False,
        )
        for row, i in enumerate(miss_indices):
            vectors[i] = miss_vectors[row]
        logger.info(
            "Re-embedded %d chunks (cache miss due to position shift).",
            len(miss_indices),
        )

    if vectors:
        all_embeddings = np.vstack([v.reshape(1, -1) for v in vectors]).astype(np.float32)
    else:
        all_embeddings = np.empty((0, dimension), dtype=np.float32)

    metadata_list = [
        ChunkMetadata(
            file_path=chunk.file_path,
            start_line=chunk.start_line,
            end_line=chunk.end_line,
            chunk_index=chunk.chunk_index,
            language=chunk.language,
            content=chunk.content,
            content_hash=chunk_file_hashes[i],
        )
        for i, chunk in enumerate(all_chunks)
    ]

    store.add(all_embeddings, metadata_list)
    store.save(index_dir)

    # Step 8: Update hash stores
    for sf in files_to_index:
        hash_store.set(sf.relative_path, sf.content_hash)
    # Update chunk-level hashes
    for chunk in all_chunks:
        c_key = ChunkHashStore.chunk_key(
            chunk.file_path, chunk.start_line, chunk.end_line,
        )
        chunk_hash_store.set(c_key, compute_chunk_hash(chunk.content))

    hash_store.save(index_dir)
    chunk_hash_store.save(index_dir)

    result.total_vectors = store.size

    # Step 9: Extract symbols and populate registry
    registry, result.symbols_extracted = _extract_symbols(
        files_to_index, deleted_paths, index_dir, force,
    )

    # Step 10: Update index manifest
    manifest = IndexManifest.load(index_dir) or IndexManifest()
    manifest.embedding_model = config.embedding.model_name
    manifest.embedding_dimension = dimension
    manifest.project_root = str(project_root)
    manifest.total_files = result.files_indexed + result.files_skipped
    manifest.total_chunks = store.size
    manifest.total_symbols = registry.size
    manifest.languages = sorted({
        chunk.language for chunk in all_chunks if chunk.language != "unknown"
    })
    manifest.touch()
    manifest.save(index_dir)

    # Step 11: Compute and persist index stats
    _compute_index_stats(
        all_chunks, registry, result, config,
        dimension, store.size, indexing_start, index_dir,
    )

    logger.info("Indexing complete. %s", result)
    return result
422
+
423
+
424
+ # ── Per-file incremental indexing (Phase 27) ─────────────────────────
425
+
426
def run_incremental_indexing(
    project_root: Path,
    changed_files: list[str],
    deleted_files: list[str] | None = None,
) -> IndexingResult:
    """Re-index only specific changed/deleted files without a full repo scan.

    Instead of scanning the entire repository on every file change, this
    processes only the files passed in. Changed files are re-chunked,
    re-embedded, and their vectors replaced in the vector store. Deleted
    files have their vectors, hashes, and symbols removed.

    If no vector store exists yet, this falls back to ``run_indexing``.

    Args:
        project_root: Root directory of the project.
        changed_files: Paths of files that were created or modified.
            Absolute paths are expected; relative paths are interpreted
            relative to ``project_root``. Duplicates are processed once.
        deleted_files: Paths of files that were deleted (same conventions).

    Returns:
        IndexingResult with statistics for the incremental operation.
    """
    project_root = project_root.resolve()
    config = load_config(project_root)
    index_dir = AppConfig.index_dir(project_root)
    index_dir.mkdir(parents=True, exist_ok=True)
    deleted_files = deleted_files or []

    result = IndexingResult()

    # Load existing stores
    hash_store = HashStore.load(index_dir)
    chunk_hash_store = ChunkHashStore.load(index_dir)

    try:
        store = VectorStore.load(index_dir)
    except FileNotFoundError:
        # No existing index — fall back to full indexing
        logger.info("No existing index found; falling back to full indexing.")
        return run_indexing(project_root, force=False)

    # Step 1: Handle deleted files
    deleted_rel: list[str] = []
    for dp in deleted_files:
        p = Path(dp)
        if p.is_absolute():
            full = dp
            try:
                rel = str(p.relative_to(project_root))
            except ValueError:
                rel = str(p)
        else:
            rel = dp
            # run_indexing addresses the vector and chunk-hash stores by
            # ``project_root / relative_path``; do the same here so a
            # relative input still finds its entries.
            full = str(project_root / p)
        store.remove_by_file(full)
        hash_store.remove(rel)
        chunk_hash_store.remove_by_file(full)
        deleted_rel.append(rel)

    # Step 2: Process changed files
    scanned_files: list[ScannedFile] = []
    seen_paths: set[Path] = set()
    for fp in changed_files:
        p = Path(fp)
        if not p.is_absolute():
            p = project_root / p
        if p in seen_paths:
            # The same file reported twice would add its vectors twice.
            continue
        seen_paths.add(p)
        if not p.is_file():
            continue
        try:
            rel = str(p.relative_to(project_root))
        except ValueError:
            rel = str(p)
        content_hash = _safe_compute_hash(p)
        if content_hash is None:
            continue
        try:
            size_bytes = p.stat().st_size
        except OSError:
            # File vanished or became unreadable after the checks above.
            continue
        scanned_files.append(ScannedFile(
            path=p,
            relative_path=rel,
            extension=p.suffix,
            size_bytes=size_bytes,
            content_hash=content_hash,
        ))
    result.files_scanned = len(scanned_files)

    # Step 3: Filter to files that actually changed (hash check)
    files_to_index: list[ScannedFile] = []
    for sf in scanned_files:
        if hash_store.has_changed(sf.relative_path, sf.content_hash):
            files_to_index.append(sf)
        else:
            result.files_skipped += 1

    if not files_to_index and not deleted_files:
        logger.info("Incremental: nothing to update.")
        return result

    # Step 4: Chunk changed files
    all_chunks: list[CodeChunk] = []
    chunk_file_hashes: list[str] = []  # parallel array: hash for each chunk's file

    for sf in files_to_index:
        # Remove old vectors for this file before re-adding
        store.remove_by_file(str(sf.path))
        chunk_hash_store.remove_by_file(str(sf.path))

        chunks = chunk_file(
            sf.path,
            chunk_size=config.embedding.chunk_size,
            chunk_overlap=config.embedding.chunk_overlap,
        )
        for c in chunks:
            all_chunks.append(c)
            chunk_file_hashes.append(sf.content_hash)
        result.files_indexed += 1

    result.chunks_created = len(all_chunks)

    # Step 5: Embed and add to store
    if all_chunks:
        texts = [chunk.content for chunk in all_chunks]
        logger.info("Incremental: embedding %d chunks from %d files...",
                    len(texts), len(files_to_index))
        embeddings = generate_embeddings(
            texts,
            model_name=config.embedding.model_name,
            show_progress=False,
        )

        metadata_list = [
            ChunkMetadata(
                file_path=chunk.file_path,
                start_line=chunk.start_line,
                end_line=chunk.end_line,
                chunk_index=chunk.chunk_index,
                language=chunk.language,
                content=chunk.content,
                content_hash=chunk_file_hashes[i],
            )
            for i, chunk in enumerate(all_chunks)
        ]

        store.add(embeddings, metadata_list)

    # Step 6: Persist stores
    store.save(index_dir)

    for sf in files_to_index:
        hash_store.set(sf.relative_path, sf.content_hash)
    for chunk in all_chunks:
        c_key = ChunkHashStore.chunk_key(
            chunk.file_path, chunk.start_line, chunk.end_line,
        )
        chunk_hash_store.set(c_key, compute_chunk_hash(chunk.content))

    hash_store.save(index_dir)
    chunk_hash_store.save(index_dir)
    result.total_vectors = store.size

    # Step 7: Update symbol registry for changed/deleted files only.
    # force=False makes the shared helper load the persisted registry.
    registry, result.symbols_extracted = _extract_symbols(
        files_to_index, deleted_rel, index_dir, False,
    )

    # Step 8: Update index manifest
    manifest = IndexManifest.load(index_dir) or IndexManifest()
    manifest.total_chunks = store.size
    manifest.total_symbols = registry.size
    manifest.touch()
    manifest.save(index_dir)

    logger.info("Incremental indexing complete. %s", result)
    return result
622
+
623
+
624
def _safe_compute_hash(path: Path) -> str | None:
    """Return the hash of *path*, or ``None`` if hashing raises ``OSError``."""
    try:
        from semantic_code_intelligence.indexing.scanner import compute_file_hash

        digest = compute_file_hash(path)
    except OSError:
        # PermissionError is a subclass of OSError, so it is covered here too.
        return None
    return digest