codexa 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189) hide show
  1. codexa-0.4.0.dist-info/METADATA +650 -0
  2. codexa-0.4.0.dist-info/RECORD +189 -0
  3. codexa-0.4.0.dist-info/WHEEL +5 -0
  4. codexa-0.4.0.dist-info/entry_points.txt +2 -0
  5. codexa-0.4.0.dist-info/licenses/LICENSE +21 -0
  6. codexa-0.4.0.dist-info/top_level.txt +1 -0
  7. semantic_code_intelligence/__init__.py +5 -0
  8. semantic_code_intelligence/analysis/__init__.py +21 -0
  9. semantic_code_intelligence/analysis/ai_features.py +351 -0
  10. semantic_code_intelligence/bridge/__init__.py +28 -0
  11. semantic_code_intelligence/bridge/context_provider.py +245 -0
  12. semantic_code_intelligence/bridge/protocol.py +167 -0
  13. semantic_code_intelligence/bridge/server.py +348 -0
  14. semantic_code_intelligence/bridge/vscode.py +271 -0
  15. semantic_code_intelligence/ci/__init__.py +13 -0
  16. semantic_code_intelligence/ci/hooks.py +98 -0
  17. semantic_code_intelligence/ci/hotspots.py +272 -0
  18. semantic_code_intelligence/ci/impact.py +246 -0
  19. semantic_code_intelligence/ci/metrics.py +591 -0
  20. semantic_code_intelligence/ci/pr.py +412 -0
  21. semantic_code_intelligence/ci/quality.py +557 -0
  22. semantic_code_intelligence/ci/templates.py +164 -0
  23. semantic_code_intelligence/ci/trace.py +224 -0
  24. semantic_code_intelligence/cli/__init__.py +0 -0
  25. semantic_code_intelligence/cli/commands/__init__.py +0 -0
  26. semantic_code_intelligence/cli/commands/ask_cmd.py +153 -0
  27. semantic_code_intelligence/cli/commands/benchmark_cmd.py +303 -0
  28. semantic_code_intelligence/cli/commands/chat_cmd.py +252 -0
  29. semantic_code_intelligence/cli/commands/ci_gen_cmd.py +74 -0
  30. semantic_code_intelligence/cli/commands/context_cmd.py +120 -0
  31. semantic_code_intelligence/cli/commands/cross_refactor_cmd.py +113 -0
  32. semantic_code_intelligence/cli/commands/deps_cmd.py +91 -0
  33. semantic_code_intelligence/cli/commands/docs_cmd.py +101 -0
  34. semantic_code_intelligence/cli/commands/doctor_cmd.py +147 -0
  35. semantic_code_intelligence/cli/commands/evolve_cmd.py +171 -0
  36. semantic_code_intelligence/cli/commands/explain_cmd.py +112 -0
  37. semantic_code_intelligence/cli/commands/gate_cmd.py +135 -0
  38. semantic_code_intelligence/cli/commands/grep_cmd.py +234 -0
  39. semantic_code_intelligence/cli/commands/hotspots_cmd.py +119 -0
  40. semantic_code_intelligence/cli/commands/impact_cmd.py +131 -0
  41. semantic_code_intelligence/cli/commands/index_cmd.py +138 -0
  42. semantic_code_intelligence/cli/commands/init_cmd.py +152 -0
  43. semantic_code_intelligence/cli/commands/investigate_cmd.py +163 -0
  44. semantic_code_intelligence/cli/commands/languages_cmd.py +101 -0
  45. semantic_code_intelligence/cli/commands/lsp_cmd.py +49 -0
  46. semantic_code_intelligence/cli/commands/mcp_cmd.py +50 -0
  47. semantic_code_intelligence/cli/commands/metrics_cmd.py +264 -0
  48. semantic_code_intelligence/cli/commands/models_cmd.py +157 -0
  49. semantic_code_intelligence/cli/commands/plugin_cmd.py +275 -0
  50. semantic_code_intelligence/cli/commands/pr_summary_cmd.py +178 -0
  51. semantic_code_intelligence/cli/commands/quality_cmd.py +208 -0
  52. semantic_code_intelligence/cli/commands/refactor_cmd.py +103 -0
  53. semantic_code_intelligence/cli/commands/review_cmd.py +88 -0
  54. semantic_code_intelligence/cli/commands/search_cmd.py +236 -0
  55. semantic_code_intelligence/cli/commands/serve_cmd.py +117 -0
  56. semantic_code_intelligence/cli/commands/suggest_cmd.py +100 -0
  57. semantic_code_intelligence/cli/commands/summary_cmd.py +78 -0
  58. semantic_code_intelligence/cli/commands/tool_cmd.py +282 -0
  59. semantic_code_intelligence/cli/commands/trace_cmd.py +123 -0
  60. semantic_code_intelligence/cli/commands/tui_cmd.py +58 -0
  61. semantic_code_intelligence/cli/commands/viz_cmd.py +127 -0
  62. semantic_code_intelligence/cli/commands/watch_cmd.py +72 -0
  63. semantic_code_intelligence/cli/commands/web_cmd.py +61 -0
  64. semantic_code_intelligence/cli/commands/workspace_cmd.py +250 -0
  65. semantic_code_intelligence/cli/main.py +65 -0
  66. semantic_code_intelligence/cli/router.py +92 -0
  67. semantic_code_intelligence/config/__init__.py +0 -0
  68. semantic_code_intelligence/config/settings.py +260 -0
  69. semantic_code_intelligence/context/__init__.py +19 -0
  70. semantic_code_intelligence/context/engine.py +429 -0
  71. semantic_code_intelligence/context/memory.py +253 -0
  72. semantic_code_intelligence/daemon/__init__.py +1 -0
  73. semantic_code_intelligence/daemon/watcher.py +515 -0
  74. semantic_code_intelligence/docs/__init__.py +1080 -0
  75. semantic_code_intelligence/embeddings/__init__.py +0 -0
  76. semantic_code_intelligence/embeddings/enhanced.py +131 -0
  77. semantic_code_intelligence/embeddings/generator.py +149 -0
  78. semantic_code_intelligence/embeddings/model_registry.py +100 -0
  79. semantic_code_intelligence/evolution/__init__.py +1 -0
  80. semantic_code_intelligence/evolution/budget_guard.py +111 -0
  81. semantic_code_intelligence/evolution/commit_manager.py +88 -0
  82. semantic_code_intelligence/evolution/context_builder.py +131 -0
  83. semantic_code_intelligence/evolution/engine.py +249 -0
  84. semantic_code_intelligence/evolution/patch_generator.py +229 -0
  85. semantic_code_intelligence/evolution/task_selector.py +214 -0
  86. semantic_code_intelligence/evolution/test_runner.py +111 -0
  87. semantic_code_intelligence/indexing/__init__.py +0 -0
  88. semantic_code_intelligence/indexing/chunker.py +174 -0
  89. semantic_code_intelligence/indexing/parallel.py +86 -0
  90. semantic_code_intelligence/indexing/scanner.py +146 -0
  91. semantic_code_intelligence/indexing/semantic_chunker.py +337 -0
  92. semantic_code_intelligence/llm/__init__.py +62 -0
  93. semantic_code_intelligence/llm/cache.py +219 -0
  94. semantic_code_intelligence/llm/cached_provider.py +145 -0
  95. semantic_code_intelligence/llm/conversation.py +190 -0
  96. semantic_code_intelligence/llm/cross_refactor.py +272 -0
  97. semantic_code_intelligence/llm/investigation.py +274 -0
  98. semantic_code_intelligence/llm/mock_provider.py +77 -0
  99. semantic_code_intelligence/llm/ollama_provider.py +122 -0
  100. semantic_code_intelligence/llm/openai_provider.py +100 -0
  101. semantic_code_intelligence/llm/provider.py +92 -0
  102. semantic_code_intelligence/llm/rate_limiter.py +164 -0
  103. semantic_code_intelligence/llm/reasoning.py +438 -0
  104. semantic_code_intelligence/llm/safety.py +110 -0
  105. semantic_code_intelligence/llm/streaming.py +251 -0
  106. semantic_code_intelligence/lsp/__init__.py +609 -0
  107. semantic_code_intelligence/mcp/__init__.py +393 -0
  108. semantic_code_intelligence/parsing/__init__.py +19 -0
  109. semantic_code_intelligence/parsing/parser.py +375 -0
  110. semantic_code_intelligence/plugins/__init__.py +255 -0
  111. semantic_code_intelligence/plugins/examples/__init__.py +1 -0
  112. semantic_code_intelligence/plugins/examples/code_quality.py +73 -0
  113. semantic_code_intelligence/plugins/examples/search_annotator.py +56 -0
  114. semantic_code_intelligence/scalability/__init__.py +205 -0
  115. semantic_code_intelligence/search/__init__.py +0 -0
  116. semantic_code_intelligence/search/formatter.py +123 -0
  117. semantic_code_intelligence/search/grep.py +361 -0
  118. semantic_code_intelligence/search/hybrid_search.py +170 -0
  119. semantic_code_intelligence/search/keyword_search.py +311 -0
  120. semantic_code_intelligence/search/section_expander.py +103 -0
  121. semantic_code_intelligence/services/__init__.py +0 -0
  122. semantic_code_intelligence/services/indexing_service.py +630 -0
  123. semantic_code_intelligence/services/search_service.py +269 -0
  124. semantic_code_intelligence/storage/__init__.py +0 -0
  125. semantic_code_intelligence/storage/chunk_hash_store.py +86 -0
  126. semantic_code_intelligence/storage/hash_store.py +66 -0
  127. semantic_code_intelligence/storage/index_manifest.py +85 -0
  128. semantic_code_intelligence/storage/index_stats.py +138 -0
  129. semantic_code_intelligence/storage/query_history.py +160 -0
  130. semantic_code_intelligence/storage/symbol_registry.py +209 -0
  131. semantic_code_intelligence/storage/vector_store.py +297 -0
  132. semantic_code_intelligence/tests/__init__.py +0 -0
  133. semantic_code_intelligence/tests/test_ai_features.py +351 -0
  134. semantic_code_intelligence/tests/test_chunker.py +119 -0
  135. semantic_code_intelligence/tests/test_cli.py +188 -0
  136. semantic_code_intelligence/tests/test_config.py +154 -0
  137. semantic_code_intelligence/tests/test_context.py +381 -0
  138. semantic_code_intelligence/tests/test_embeddings.py +73 -0
  139. semantic_code_intelligence/tests/test_endtoend.py +1142 -0
  140. semantic_code_intelligence/tests/test_enhanced_embeddings.py +92 -0
  141. semantic_code_intelligence/tests/test_hash_store.py +79 -0
  142. semantic_code_intelligence/tests/test_logging.py +55 -0
  143. semantic_code_intelligence/tests/test_new_cli.py +138 -0
  144. semantic_code_intelligence/tests/test_parser.py +495 -0
  145. semantic_code_intelligence/tests/test_phase10.py +355 -0
  146. semantic_code_intelligence/tests/test_phase11.py +593 -0
  147. semantic_code_intelligence/tests/test_phase12.py +375 -0
  148. semantic_code_intelligence/tests/test_phase13.py +663 -0
  149. semantic_code_intelligence/tests/test_phase14.py +568 -0
  150. semantic_code_intelligence/tests/test_phase15.py +814 -0
  151. semantic_code_intelligence/tests/test_phase16.py +792 -0
  152. semantic_code_intelligence/tests/test_phase17.py +815 -0
  153. semantic_code_intelligence/tests/test_phase18.py +934 -0
  154. semantic_code_intelligence/tests/test_phase19.py +986 -0
  155. semantic_code_intelligence/tests/test_phase20.py +2753 -0
  156. semantic_code_intelligence/tests/test_phase20b.py +2058 -0
  157. semantic_code_intelligence/tests/test_phase20c.py +962 -0
  158. semantic_code_intelligence/tests/test_phase21.py +428 -0
  159. semantic_code_intelligence/tests/test_phase22.py +799 -0
  160. semantic_code_intelligence/tests/test_phase23.py +783 -0
  161. semantic_code_intelligence/tests/test_phase24.py +715 -0
  162. semantic_code_intelligence/tests/test_phase25.py +496 -0
  163. semantic_code_intelligence/tests/test_phase26.py +251 -0
  164. semantic_code_intelligence/tests/test_phase27.py +531 -0
  165. semantic_code_intelligence/tests/test_phase8.py +592 -0
  166. semantic_code_intelligence/tests/test_phase9.py +643 -0
  167. semantic_code_intelligence/tests/test_plugins.py +293 -0
  168. semantic_code_intelligence/tests/test_priority_features.py +727 -0
  169. semantic_code_intelligence/tests/test_router.py +41 -0
  170. semantic_code_intelligence/tests/test_scalability.py +138 -0
  171. semantic_code_intelligence/tests/test_scanner.py +125 -0
  172. semantic_code_intelligence/tests/test_search.py +160 -0
  173. semantic_code_intelligence/tests/test_semantic_chunker.py +255 -0
  174. semantic_code_intelligence/tests/test_tools.py +182 -0
  175. semantic_code_intelligence/tests/test_vector_store.py +151 -0
  176. semantic_code_intelligence/tests/test_watcher.py +211 -0
  177. semantic_code_intelligence/tools/__init__.py +442 -0
  178. semantic_code_intelligence/tools/executor.py +232 -0
  179. semantic_code_intelligence/tools/protocol.py +200 -0
  180. semantic_code_intelligence/tui/__init__.py +454 -0
  181. semantic_code_intelligence/utils/__init__.py +0 -0
  182. semantic_code_intelligence/utils/logging.py +112 -0
  183. semantic_code_intelligence/version.py +3 -0
  184. semantic_code_intelligence/web/__init__.py +11 -0
  185. semantic_code_intelligence/web/api.py +289 -0
  186. semantic_code_intelligence/web/server.py +397 -0
  187. semantic_code_intelligence/web/ui.py +659 -0
  188. semantic_code_intelligence/web/visualize.py +226 -0
  189. semantic_code_intelligence/workspace/__init__.py +427 -0
@@ -0,0 +1,160 @@
1
+ """Query history — cross-session intelligence and search analytics.
2
+
3
+ Records past search queries, their result counts and scores, enabling
4
+ popular-symbol tracking, query suggestions, and analytics on what
5
+ developers search for most.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import time
12
+ from dataclasses import asdict, dataclass, field
13
+ from pathlib import Path
14
+ from typing import Any
15
+
16
+ HISTORY_FILE = "query_history.json"  # file name QueryHistory.save()/load() use inside the given directory
17
+ MAX_HISTORY = 500  # default for QueryHistory's max_entries (cap on retained records)
18
+
19
+
20
@dataclass
class QueryRecord:
    """One search query together with the outcome recorded for it."""

    query: str
    timestamp: float = 0.0
    result_count: int = 0
    top_score: float = 0.0
    languages: list[str] = field(default_factory=list)
    top_files: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the record as a plain ``dict`` keyed by field name."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> QueryRecord:
        """Build a record from *data*, silently dropping unknown keys.

        Only keys that name a dataclass field are forwarded to the
        constructor, so extra keys in *data* are ignored.
        """
        field_names = cls.__dataclass_fields__  # type: ignore[attr-defined]
        kwargs: dict[str, Any] = {}
        for key, value in data.items():
            if key in field_names:
                kwargs[key] = value
        return cls(**kwargs)
38
+
39
+
40
class QueryHistory:
    """Persistent, bounded log of search queries with simple analytics.

    Records are kept in insertion order (oldest first, newest last) and the
    list is trimmed to at most *max_entries* items after every ``record()``
    call.  Aggregate helpers report the most frequent query strings, the
    files that appear most often in results, and the mean result count.
    """

    def __init__(self, max_entries: int = MAX_HISTORY) -> None:
        """Create an empty history capped at *max_entries* records."""
        self._records: list[QueryRecord] = []
        self._max_entries = max_entries

    # ------------------------------------------------------------------
    # Mutation
    # ------------------------------------------------------------------

    def record(
        self,
        query: str,
        result_count: int = 0,
        top_score: float = 0.0,
        languages: list[str] | None = None,
        top_files: list[str] | None = None,
    ) -> QueryRecord:
        """Append a query stamped with the current time and return its record.

        The *languages* and *top_files* lists are copied so that later
        mutation by the caller does not alter the stored record.  After the
        append, the oldest records are dropped until at most ``max_entries``
        remain.
        """
        entry = QueryRecord(
            query=query,
            timestamp=time.time(),
            result_count=result_count,
            top_score=top_score,
            languages=list(languages) if languages else [],
            top_files=list(top_files) if top_files else [],
        )
        self._records.append(entry)
        # One slice deletion instead of repeated pop(0); it also copes with a
        # zero or negative cap, which simply empties the list.
        overflow = len(self._records) - self._max_entries
        if overflow > 0:
            del self._records[:overflow]
        return entry

    def clear(self) -> None:
        """Remove all history."""
        self._records.clear()

    # ------------------------------------------------------------------
    # Queries
    # ------------------------------------------------------------------

    @property
    def size(self) -> int:
        """Number of records currently held."""
        return len(self._records)

    @property
    def records(self) -> list[QueryRecord]:
        """Return a copy of all records (newest last)."""
        return list(self._records)

    def recent(self, n: int = 10) -> list[QueryRecord]:
        """Return the *n* most recent records, oldest of them first.

        A non-positive *n* yields an empty list.  The guard is required
        because ``records[-0:]`` is the whole list, not an empty one.
        """
        if n <= 0:
            return []
        return self._records[-n:]

    def popular_queries(self, n: int = 10) -> list[tuple[str, int]]:
        """Return the *n* most frequent query strings with their counts."""
        counts: dict[str, int] = {}
        for rec in self._records:
            counts[rec.query] = counts.get(rec.query, 0) + 1
        return self._rank(counts, n)

    def popular_files(self, n: int = 10) -> list[tuple[str, int]]:
        """Return the *n* files that appear most often in ``top_files``."""
        counts: dict[str, int] = {}
        for rec in self._records:
            for file_name in rec.top_files:
                counts[file_name] = counts.get(file_name, 0) + 1
        return self._rank(counts, n)

    def avg_result_count(self) -> float:
        """Return the mean ``result_count`` per record (0.0 when empty)."""
        if not self._records:
            return 0.0
        return sum(rec.result_count for rec in self._records) / len(self._records)

    def to_dict(self) -> dict[str, Any]:
        """Return the JSON-serialisable form written by ``save()``."""
        return {
            "max_entries": self._max_entries,
            "records": [rec.to_dict() for rec in self._records],
        }

    def __repr__(self) -> str:
        return f"QueryHistory(records={len(self._records)})"

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------

    def save(self, directory: str | Path) -> None:
        """Write the history as JSON to ``HISTORY_FILE`` inside *directory*.

        The directory (and any missing parents) is created first.
        """
        path = Path(directory)
        path.mkdir(parents=True, exist_ok=True)
        (path / HISTORY_FILE).write_text(
            json.dumps(self.to_dict(), indent=2, ensure_ascii=False),
            encoding="utf-8",
        )

    @classmethod
    def load(cls, directory: str | Path) -> QueryHistory:
        """Load history from *directory*; never raises on bad or missing data.

        Loading is best-effort: a missing, unreadable, or undecodable file
        gives an empty history, and individual malformed records (not a
        dict, or lacking the required ``query`` field) are skipped.  A
        ``max_entries`` value that is not an integer is ignored in favour of
        the default.
        """
        history = cls()
        path = Path(directory) / HISTORY_FILE
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
        except (ValueError, OSError):
            # ValueError covers JSONDecodeError and UnicodeDecodeError;
            # OSError covers a missing or unreadable file.
            return history
        if not isinstance(data, dict):
            return history

        max_entries = data.get("max_entries", MAX_HISTORY)
        # bool is a subclass of int but is never a meaningful cap.
        if isinstance(max_entries, int) and not isinstance(max_entries, bool):
            history._max_entries = max_entries

        items = data.get("records", [])
        if not isinstance(items, list):
            return history
        for item in items:
            if not isinstance(item, dict):
                continue
            try:
                history._records.append(QueryRecord.from_dict(item))
            except TypeError:
                # Raised by the constructor when a required field is absent.
                continue
        return history

    # ------------------------------------------------------------------
    # Internal
    # ------------------------------------------------------------------

    @staticmethod
    def _rank(counts: dict[str, int], n: int) -> list[tuple[str, int]]:
        """Return the first *n* ``(key, count)`` pairs by descending count.

        The sort is stable, so keys with equal counts keep first-seen order.
        """
        ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
        return ranked[:n]
@@ -0,0 +1,209 @@
1
+ """Symbol registry — persistent, queryable directory of code symbols.
2
+
3
+ Stores every function, class, and method extracted from the codebase,
4
+ enabling fast lookups by name, kind, file, or parent class without
5
+ re-parsing source files.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ from dataclasses import asdict, dataclass, field
12
+ from pathlib import Path
13
+ from typing import Any, Iterator
14
+
15
+ REGISTRY_FILE = "symbol_registry.json"  # file name SymbolRegistry.save()/load() use inside the given directory
16
+
17
+
18
@dataclass
class SymbolEntry:
    """One symbol (function, class, method or import) held by the registry."""

    name: str
    kind: str  # "function", "class", "method", "import"
    file_path: str
    start_line: int
    end_line: int
    parent: str | None = None
    parameters: list[str] = field(default_factory=list)
    decorators: list[str] = field(default_factory=list)
    language: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the entry as a plain ``dict`` keyed by field name."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> SymbolEntry:
        """Build an entry from *data*, ignoring keys that are not fields."""
        field_names = cls.__dataclass_fields__  # type: ignore[attr-defined]
        kwargs: dict[str, Any] = {}
        for key, value in data.items():
            if key in field_names:
                kwargs[key] = value
        return cls(**kwargs)

    @property
    def qualified_name(self) -> str:
        """``parent.name`` when a parent is set, otherwise just ``name``."""
        return f"{self.parent}.{self.name}" if self.parent else self.name
47
+
48
+
49
class SymbolRegistry:
    """JSON-persisted directory of code symbols with a per-file index.

    Entries live in ``_symbols`` in insertion order.  ``_by_file`` maps each
    ``file_path`` to the positions of its entries in that list, which makes
    per-file lookup and removal cheap and supports incremental updates
    (remove a file's symbols, then re-add the fresh ones).
    """

    def __init__(self) -> None:
        """Create an empty registry."""
        self._symbols: list[SymbolEntry] = []
        # Secondary index: file_path -> positions in ``_symbols``.
        self._by_file: dict[str, list[int]] = {}

    # ------------------------------------------------------------------
    # Mutation
    # ------------------------------------------------------------------

    def add(self, entry: SymbolEntry) -> None:
        """Append *entry* and register its position under its file path."""
        position = len(self._symbols)
        self._symbols.append(entry)
        self._by_file.setdefault(entry.file_path, []).append(position)

    def add_many(self, entries: list[SymbolEntry]) -> None:
        """Add every entry of *entries*, in order."""
        for entry in entries:
            self.add(entry)

    def remove_file(self, file_path: str) -> int:
        """Remove all symbols belonging to *file_path*.

        Returns the number of entries removed (0 if the file is unknown).
        Removing entries shifts the positions of those that remain, so the
        per-file index is rebuilt afterwards.
        """
        positions = self._by_file.pop(file_path, [])
        if not positions:
            return 0
        doomed = set(positions)
        self._symbols = [
            sym for pos, sym in enumerate(self._symbols) if pos not in doomed
        ]
        self._rebuild_file_index()
        return len(positions)

    def clear(self) -> None:
        """Remove all symbols."""
        self._symbols.clear()
        self._by_file.clear()

    # ------------------------------------------------------------------
    # Queries
    # ------------------------------------------------------------------

    @property
    def size(self) -> int:
        """Number of symbols currently stored."""
        return len(self._symbols)

    @property
    def files(self) -> list[str]:
        """Return all tracked file paths."""
        return list(self._by_file)

    def find_by_name(self, name: str) -> list[SymbolEntry]:
        """Find all symbols whose ``name`` equals *name* exactly."""
        return [sym for sym in self._symbols if sym.name == name]

    def find_by_kind(self, kind: str) -> list[SymbolEntry]:
        """Find all symbols of a given *kind* (function, class, method, import)."""
        return [sym for sym in self._symbols if sym.kind == kind]

    def find_by_file(self, file_path: str) -> list[SymbolEntry]:
        """Return all symbols recorded for *file_path* (empty if unknown)."""
        return list(self._iter_candidates(file_path))

    def find(
        self,
        name: str | None = None,
        kind: str | None = None,
        file_path: str | None = None,
        parent: str | None = None,
        language: str | None = None,
    ) -> list[SymbolEntry]:
        """Multi-criteria symbol lookup; ``None`` criteria are not applied.

        When *file_path* is given, only that file's entries are scanned;
        the other criteria are compared for exact equality.
        """
        return [
            sym
            for sym in self._iter_candidates(file_path)
            if (name is None or sym.name == name)
            and (kind is None or sym.kind == kind)
            and (parent is None or sym.parent == parent)
            and (language is None or sym.language == language)
        ]

    def search_name(self, substring: str) -> list[SymbolEntry]:
        """Return symbols whose name contains *substring* (case-insensitive)."""
        needle = substring.lower()
        return [sym for sym in self._symbols if needle in sym.name.lower()]

    def language_summary(self) -> dict[str, int]:
        """Count symbols per language; an empty language counts as "unknown"."""
        counts: dict[str, int] = {}
        for sym in self._symbols:
            lang = sym.language or "unknown"
            counts[lang] = counts.get(lang, 0) + 1
        return counts

    def kind_summary(self) -> dict[str, int]:
        """Return a count of symbols per kind."""
        counts: dict[str, int] = {}
        for sym in self._symbols:
            counts[sym.kind] = counts.get(sym.kind, 0) + 1
        return counts

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------

    def save(self, directory: str | Path) -> None:
        """Write all entries as a JSON list to ``REGISTRY_FILE`` in *directory*.

        The directory (and any missing parents) is created first.
        """
        path = Path(directory)
        path.mkdir(parents=True, exist_ok=True)
        payload = [sym.to_dict() for sym in self._symbols]
        (path / REGISTRY_FILE).write_text(
            json.dumps(payload, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )

    @classmethod
    def load(cls, directory: str | Path) -> SymbolRegistry:
        """Load the registry from *directory*; never raises on bad data.

        Loading is best-effort: a missing, unreadable, or undecodable file
        gives an empty registry, and individual malformed items (not a dict,
        or lacking a required field) are skipped instead of aborting the
        whole load.
        """
        registry = cls()
        path = Path(directory) / REGISTRY_FILE
        try:
            raw = json.loads(path.read_text(encoding="utf-8"))
        except (ValueError, OSError):
            # ValueError covers JSONDecodeError and UnicodeDecodeError;
            # OSError covers a missing or unreadable file.
            return registry
        if not isinstance(raw, list):
            return registry
        for item in raw:
            if not isinstance(item, dict):
                continue
            try:
                registry.add(SymbolEntry.from_dict(item))
            except TypeError:
                # Raised by the constructor when a required field is absent.
                continue
        return registry

    # ------------------------------------------------------------------
    # Internal
    # ------------------------------------------------------------------

    def _rebuild_file_index(self) -> None:
        """Recompute ``_by_file`` from the current contents of ``_symbols``."""
        self._by_file.clear()
        for position, sym in enumerate(self._symbols):
            self._by_file.setdefault(sym.file_path, []).append(position)

    def _iter_candidates(self, file_path: str | None) -> Iterator[SymbolEntry]:
        """Yield one file's entries, or every entry when *file_path* is ``None``."""
        if file_path is None:
            yield from self._symbols
            return
        for position in self._by_file.get(file_path, []):
            yield self._symbols[position]
@@ -0,0 +1,297 @@
1
+ """Vector store — FAISS-based storage and retrieval of code embeddings.
2
+
3
+ Supports two index modes:
4
+ - **Flat** (default): Brute-force exact search — best for <50 k vectors.
5
+ - **IVF**: Inverted-file approximate search — faster for large repos (>50 k).
6
+ Enabled automatically when the vector count crosses *IVF_THRESHOLD* or by
7
+ passing ``use_ivf=True`` to the constructor.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ from collections import defaultdict
14
+ from dataclasses import asdict, dataclass
15
+ from pathlib import Path
16
+ from typing import Any
17
+
18
+ import faiss
19
+ import numpy as np
20
+
21
+ from semantic_code_intelligence.utils.logging import get_logger
22
+
23
+ logger = get_logger("storage")  # module-level logger, requested under the name "storage"
24
+
25
+ # Vector count at which VectorStore.add() migrates a flat index to IVF.
26
+ IVF_THRESHOLD = 50_000
27
+ IVF_NLIST = 100 # number of Voronoi cells passed to IndexIVFFlat; also the minimum batch size add() accepts for training
28
+ IVF_NPROBE = 10 # cells probed at search time (assigned to index.nprobe)
29
+
30
+
31
@dataclass
class ChunkMetadata:
    """Descriptive record stored alongside each code-chunk embedding."""

    # Location of the chunk.
    file_path: str
    start_line: int
    end_line: int
    chunk_index: int
    # Chunk payload.
    language: str
    content: str
    content_hash: str = ""
42
+
43
+
44
class VectorStore:
    """FAISS-backed store pairing code-chunk embeddings with their metadata.

    ``index`` holds the vectors and ``metadata`` is a parallel list: the
    entry at position *i* describes the vector with id *i*.  ``_file_index``
    maps each ``file_path`` to the set of positions that belong to it.

    The store uses an inner-product ``faiss.IndexFlatIP`` unless *use_ivf*
    is ``True``.  A flat store is migrated to ``faiss.IndexIVFFlat`` once it
    holds ``IVF_THRESHOLD`` vectors.
    """

    def __init__(self, dimension: int, *, use_ivf: bool = False) -> None:
        """Create an empty store for vectors of length *dimension*.

        Args:
            dimension: Length of every embedding vector.
            use_ivf: Start with an (untrained) IVF index instead of a flat one.
        """
        self.dimension = dimension
        self._use_ivf = use_ivf
        if use_ivf:
            quantizer = faiss.IndexFlatIP(dimension)
            self.index = faiss.IndexIVFFlat(
                quantizer, dimension, IVF_NLIST, faiss.METRIC_INNER_PRODUCT
            )
            self.index.nprobe = IVF_NPROBE
            self._ivf_trained = False
        else:
            self.index = faiss.IndexFlatIP(dimension)
            self._ivf_trained = True  # flat doesn't need training
        self.metadata: list[ChunkMetadata] = []
        # Reverse index: file_path -> set of vector positions.
        self._file_index: dict[str, set[int]] = defaultdict(set)

    @property
    def size(self) -> int:
        """Return the number of vectors stored."""
        return int(self.index.ntotal)

    def add(
        self,
        embeddings: np.ndarray,
        metadata_list: list[ChunkMetadata],
    ) -> None:
        """Add embeddings and their metadata to the store.

        An untrained IVF index is trained on this batch when the batch has
        at least ``IVF_NLIST`` vectors; otherwise the store switches to a
        flat index.  A flat store that reaches ``IVF_THRESHOLD`` vectors is
        upgraded to IVF after the add.

        Args:
            embeddings: Array of shape ``(n, dimension)``.
            metadata_list: One ``ChunkMetadata`` per embedding row.

        Raises:
            ValueError: If the two inputs differ in length, or the embeddings
                are not two-dimensional with ``dimension`` columns.
        """
        if len(embeddings) != len(metadata_list):
            raise ValueError(
                f"Embedding count ({len(embeddings)}) != metadata count ({len(metadata_list)})"
            )
        if len(embeddings) == 0:
            return

        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
        # Reject bad shapes up front so nothing is mutated on failure.
        if embeddings.ndim != 2 or embeddings.shape[1] != self.dimension:
            raise ValueError(
                f"Expected embeddings of shape (n, {self.dimension}), got {embeddings.shape}"
            )

        # Train IVF index on first batch if needed
        if self._use_ivf and not self._ivf_trained:
            if len(embeddings) >= IVF_NLIST:
                self.index.train(embeddings)
                self._ivf_trained = True
            else:
                # Not enough vectors to train — fall back to flat temporarily
                logger.debug(
                    "Not enough vectors to train IVF (%d < %d), using flat.",
                    len(embeddings),
                    IVF_NLIST,
                )
                self.index = faiss.IndexFlatIP(self.dimension)
                self._use_ivf = False
                self._ivf_trained = True

        # Add the vectors first: if FAISS raises, metadata and the file index
        # stay consistent with the index contents.
        base = len(self.metadata)
        self.index.add(embeddings)
        self.metadata.extend(metadata_list)
        for offset, meta in enumerate(metadata_list):
            self._file_index[meta.file_path].add(base + offset)

        # Auto-upgrade from flat to IVF when threshold is crossed
        if not self._use_ivf and self.size >= IVF_THRESHOLD:
            self._upgrade_to_ivf()

    def search(
        self,
        query_embedding: np.ndarray,
        top_k: int = 10,
    ) -> list[tuple[ChunkMetadata, float]]:
        """Search for the most similar embeddings.

        Args:
            query_embedding: Query vector of shape (dimension,) or (1, dimension).
            top_k: Number of top results to return.

        Returns:
            List of (metadata, score) tuples in the order FAISS returns them
            (decreasing similarity).  Empty when the store is empty or
            *top_k* is not positive.
        """
        if top_k <= 0 or self.size == 0:
            return []

        query = np.ascontiguousarray(
            query_embedding.reshape(1, -1), dtype=np.float32
        )
        k = min(top_k, self.size)
        scores, indices = self.index.search(query, k)

        # FAISS pads missing results with id -1; skip those slots.
        return [
            (self.metadata[idx], float(score))
            for score, idx in zip(scores[0], indices[0])
            if idx >= 0
        ]

    def save(self, directory: Path) -> None:
        """Persist the vector store to disk.

        Writes the FAISS index to ``vectors.faiss`` and the metadata list to
        ``metadata.json`` inside *directory*, creating it if needed.

        Args:
            directory: Directory to save into.
        """
        directory = Path(directory)
        directory.mkdir(parents=True, exist_ok=True)

        index_path = directory / "vectors.faiss"
        meta_path = directory / "metadata.json"

        faiss.write_index(self.index, str(index_path))

        meta_dicts = [asdict(m) for m in self.metadata]
        meta_path.write_text(
            json.dumps(meta_dicts, ensure_ascii=False),
            encoding="utf-8",
        )
        logger.info("Saved %d vectors to %s", self.size, directory)

    @classmethod
    def load(cls, directory: Path) -> "VectorStore":
        """Load a vector store from disk.

        Args:
            directory: Directory containing vectors.faiss and metadata.json.

        Returns:
            A populated VectorStore instance.

        Raises:
            FileNotFoundError: If the required files don't exist.
        """
        directory = Path(directory)
        index_path = directory / "vectors.faiss"
        meta_path = directory / "metadata.json"

        if not index_path.exists() or not meta_path.exists():
            raise FileNotFoundError(f"No vector store found in {directory}")

        index = faiss.read_index(str(index_path))
        dimension = index.d

        meta_dicts = json.loads(meta_path.read_text(encoding="utf-8"))
        metadata = [ChunkMetadata(**m) for m in meta_dicts]

        store = cls(dimension)
        store.index = index
        # Mirror the persisted index type in the mode flags.  Otherwise a
        # reloaded IVF index looks "flat" and add() would try to upgrade it
        # again once the threshold is met.
        if isinstance(index, faiss.IndexIVF):
            store._use_ivf = True
            store._ivf_trained = bool(index.is_trained)
        store.metadata = metadata
        # Rebuild file index from loaded metadata
        for position, meta in enumerate(metadata):
            store._file_index[meta.file_path].add(position)
        if store.size != len(metadata):
            logger.warning(
                "Vector count (%d) != metadata count (%d) in %s",
                store.size,
                len(metadata),
                directory,
            )
        logger.info("Loaded %d vectors from %s", store.size, directory)
        return store

    def remove_by_file(self, file_path: str) -> int:
        """Remove all vectors whose metadata references *file_path*.

        The index is rebuilt from the surviving vectors, so every remaining
        vector is renumbered and the file index is recomputed.

        Args:
            file_path: The ``file_path`` field to match against.

        Returns:
            Number of vectors removed.
        """
        remove_set = self._file_index.get(file_path)
        if not remove_set:
            return 0

        removed = len(remove_set)
        keep_indices = [
            i for i in range(len(self.metadata)) if i not in remove_set
        ]

        if keep_indices:
            # One bulk reconstruction instead of a per-vector Python loop.
            # reconstruct_n also works on IVF indexes that have no direct
            # map, which per-id reconstruct() does not.
            all_vectors = self.index.reconstruct_n(0, self.size)
            kept_vectors = np.ascontiguousarray(
                all_vectors[keep_indices], dtype=np.float32
            )
        else:
            kept_vectors = np.empty((0, self.dimension), dtype=np.float32)
        kept_meta = [self.metadata[i] for i in keep_indices]

        self.index.reset()
        if len(kept_vectors) > 0:
            self.index.add(kept_vectors)
        self.metadata = kept_meta

        # Rebuild file index
        self._file_index.clear()
        for position, meta in enumerate(self.metadata):
            self._file_index[meta.file_path].add(position)

        logger.debug("Removed %d vectors for %s", removed, file_path)
        return removed

    def get_vectors_for_file(self, file_path: str) -> list[tuple[ChunkMetadata, np.ndarray]]:
        """Return metadata and vectors for all chunks belonging to a file.

        Used by incremental indexing to preserve vectors for unchanged chunks
        before removing the file's entries from the store.

        Returns:
            List of (metadata, vector) pairs, ordered by position in the store.
        """
        indices = self._file_index.get(file_path)
        if not indices:
            return []
        if self._use_ivf:
            # Per-id reconstruct() on an IVF index needs the direct map;
            # requesting it again when it already exists does nothing.
            self.index.make_direct_map()
        result: list[tuple[ChunkMetadata, np.ndarray]] = []
        for idx in sorted(indices):
            vec = np.empty(self.dimension, dtype=np.float32)
            self.index.reconstruct(idx, vec)
            result.append((self.metadata[idx], vec))
        return result

    def clear(self) -> None:
        """Remove all vectors and metadata."""
        self.index.reset()
        self.metadata.clear()
        self._file_index.clear()

    # ------------------------------------------------------------------
    # IVF helpers
    # ------------------------------------------------------------------

    def _upgrade_to_ivf(self) -> None:
        """Migrate an in-memory flat index to IVF for faster search.

        Does nothing when the store holds fewer than ``IVF_NLIST`` vectors.
        """
        n = self.size
        if n < IVF_NLIST:
            return  # not enough vectors
        logger.info("Auto-upgrading index to IVF (%d vectors).", n)
        # Pull every stored vector out in one call rather than n Python calls.
        all_vecs = np.ascontiguousarray(
            self.index.reconstruct_n(0, n), dtype=np.float32
        )
        quantizer = faiss.IndexFlatIP(self.dimension)
        ivf = faiss.IndexIVFFlat(
            quantizer, self.dimension, IVF_NLIST, faiss.METRIC_INNER_PRODUCT
        )
        ivf.nprobe = IVF_NPROBE
        ivf.train(all_vecs)
        ivf.add(all_vecs)
        self.index = ivf
        self._use_ivf = True
        self._ivf_trained = True
File without changes