codexa 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189) hide show
  1. codexa-0.4.0.dist-info/METADATA +650 -0
  2. codexa-0.4.0.dist-info/RECORD +189 -0
  3. codexa-0.4.0.dist-info/WHEEL +5 -0
  4. codexa-0.4.0.dist-info/entry_points.txt +2 -0
  5. codexa-0.4.0.dist-info/licenses/LICENSE +21 -0
  6. codexa-0.4.0.dist-info/top_level.txt +1 -0
  7. semantic_code_intelligence/__init__.py +5 -0
  8. semantic_code_intelligence/analysis/__init__.py +21 -0
  9. semantic_code_intelligence/analysis/ai_features.py +351 -0
  10. semantic_code_intelligence/bridge/__init__.py +28 -0
  11. semantic_code_intelligence/bridge/context_provider.py +245 -0
  12. semantic_code_intelligence/bridge/protocol.py +167 -0
  13. semantic_code_intelligence/bridge/server.py +348 -0
  14. semantic_code_intelligence/bridge/vscode.py +271 -0
  15. semantic_code_intelligence/ci/__init__.py +13 -0
  16. semantic_code_intelligence/ci/hooks.py +98 -0
  17. semantic_code_intelligence/ci/hotspots.py +272 -0
  18. semantic_code_intelligence/ci/impact.py +246 -0
  19. semantic_code_intelligence/ci/metrics.py +591 -0
  20. semantic_code_intelligence/ci/pr.py +412 -0
  21. semantic_code_intelligence/ci/quality.py +557 -0
  22. semantic_code_intelligence/ci/templates.py +164 -0
  23. semantic_code_intelligence/ci/trace.py +224 -0
  24. semantic_code_intelligence/cli/__init__.py +0 -0
  25. semantic_code_intelligence/cli/commands/__init__.py +0 -0
  26. semantic_code_intelligence/cli/commands/ask_cmd.py +153 -0
  27. semantic_code_intelligence/cli/commands/benchmark_cmd.py +303 -0
  28. semantic_code_intelligence/cli/commands/chat_cmd.py +252 -0
  29. semantic_code_intelligence/cli/commands/ci_gen_cmd.py +74 -0
  30. semantic_code_intelligence/cli/commands/context_cmd.py +120 -0
  31. semantic_code_intelligence/cli/commands/cross_refactor_cmd.py +113 -0
  32. semantic_code_intelligence/cli/commands/deps_cmd.py +91 -0
  33. semantic_code_intelligence/cli/commands/docs_cmd.py +101 -0
  34. semantic_code_intelligence/cli/commands/doctor_cmd.py +147 -0
  35. semantic_code_intelligence/cli/commands/evolve_cmd.py +171 -0
  36. semantic_code_intelligence/cli/commands/explain_cmd.py +112 -0
  37. semantic_code_intelligence/cli/commands/gate_cmd.py +135 -0
  38. semantic_code_intelligence/cli/commands/grep_cmd.py +234 -0
  39. semantic_code_intelligence/cli/commands/hotspots_cmd.py +119 -0
  40. semantic_code_intelligence/cli/commands/impact_cmd.py +131 -0
  41. semantic_code_intelligence/cli/commands/index_cmd.py +138 -0
  42. semantic_code_intelligence/cli/commands/init_cmd.py +152 -0
  43. semantic_code_intelligence/cli/commands/investigate_cmd.py +163 -0
  44. semantic_code_intelligence/cli/commands/languages_cmd.py +101 -0
  45. semantic_code_intelligence/cli/commands/lsp_cmd.py +49 -0
  46. semantic_code_intelligence/cli/commands/mcp_cmd.py +50 -0
  47. semantic_code_intelligence/cli/commands/metrics_cmd.py +264 -0
  48. semantic_code_intelligence/cli/commands/models_cmd.py +157 -0
  49. semantic_code_intelligence/cli/commands/plugin_cmd.py +275 -0
  50. semantic_code_intelligence/cli/commands/pr_summary_cmd.py +178 -0
  51. semantic_code_intelligence/cli/commands/quality_cmd.py +208 -0
  52. semantic_code_intelligence/cli/commands/refactor_cmd.py +103 -0
  53. semantic_code_intelligence/cli/commands/review_cmd.py +88 -0
  54. semantic_code_intelligence/cli/commands/search_cmd.py +236 -0
  55. semantic_code_intelligence/cli/commands/serve_cmd.py +117 -0
  56. semantic_code_intelligence/cli/commands/suggest_cmd.py +100 -0
  57. semantic_code_intelligence/cli/commands/summary_cmd.py +78 -0
  58. semantic_code_intelligence/cli/commands/tool_cmd.py +282 -0
  59. semantic_code_intelligence/cli/commands/trace_cmd.py +123 -0
  60. semantic_code_intelligence/cli/commands/tui_cmd.py +58 -0
  61. semantic_code_intelligence/cli/commands/viz_cmd.py +127 -0
  62. semantic_code_intelligence/cli/commands/watch_cmd.py +72 -0
  63. semantic_code_intelligence/cli/commands/web_cmd.py +61 -0
  64. semantic_code_intelligence/cli/commands/workspace_cmd.py +250 -0
  65. semantic_code_intelligence/cli/main.py +65 -0
  66. semantic_code_intelligence/cli/router.py +92 -0
  67. semantic_code_intelligence/config/__init__.py +0 -0
  68. semantic_code_intelligence/config/settings.py +260 -0
  69. semantic_code_intelligence/context/__init__.py +19 -0
  70. semantic_code_intelligence/context/engine.py +429 -0
  71. semantic_code_intelligence/context/memory.py +253 -0
  72. semantic_code_intelligence/daemon/__init__.py +1 -0
  73. semantic_code_intelligence/daemon/watcher.py +515 -0
  74. semantic_code_intelligence/docs/__init__.py +1080 -0
  75. semantic_code_intelligence/embeddings/__init__.py +0 -0
  76. semantic_code_intelligence/embeddings/enhanced.py +131 -0
  77. semantic_code_intelligence/embeddings/generator.py +149 -0
  78. semantic_code_intelligence/embeddings/model_registry.py +100 -0
  79. semantic_code_intelligence/evolution/__init__.py +1 -0
  80. semantic_code_intelligence/evolution/budget_guard.py +111 -0
  81. semantic_code_intelligence/evolution/commit_manager.py +88 -0
  82. semantic_code_intelligence/evolution/context_builder.py +131 -0
  83. semantic_code_intelligence/evolution/engine.py +249 -0
  84. semantic_code_intelligence/evolution/patch_generator.py +229 -0
  85. semantic_code_intelligence/evolution/task_selector.py +214 -0
  86. semantic_code_intelligence/evolution/test_runner.py +111 -0
  87. semantic_code_intelligence/indexing/__init__.py +0 -0
  88. semantic_code_intelligence/indexing/chunker.py +174 -0
  89. semantic_code_intelligence/indexing/parallel.py +86 -0
  90. semantic_code_intelligence/indexing/scanner.py +146 -0
  91. semantic_code_intelligence/indexing/semantic_chunker.py +337 -0
  92. semantic_code_intelligence/llm/__init__.py +62 -0
  93. semantic_code_intelligence/llm/cache.py +219 -0
  94. semantic_code_intelligence/llm/cached_provider.py +145 -0
  95. semantic_code_intelligence/llm/conversation.py +190 -0
  96. semantic_code_intelligence/llm/cross_refactor.py +272 -0
  97. semantic_code_intelligence/llm/investigation.py +274 -0
  98. semantic_code_intelligence/llm/mock_provider.py +77 -0
  99. semantic_code_intelligence/llm/ollama_provider.py +122 -0
  100. semantic_code_intelligence/llm/openai_provider.py +100 -0
  101. semantic_code_intelligence/llm/provider.py +92 -0
  102. semantic_code_intelligence/llm/rate_limiter.py +164 -0
  103. semantic_code_intelligence/llm/reasoning.py +438 -0
  104. semantic_code_intelligence/llm/safety.py +110 -0
  105. semantic_code_intelligence/llm/streaming.py +251 -0
  106. semantic_code_intelligence/lsp/__init__.py +609 -0
  107. semantic_code_intelligence/mcp/__init__.py +393 -0
  108. semantic_code_intelligence/parsing/__init__.py +19 -0
  109. semantic_code_intelligence/parsing/parser.py +375 -0
  110. semantic_code_intelligence/plugins/__init__.py +255 -0
  111. semantic_code_intelligence/plugins/examples/__init__.py +1 -0
  112. semantic_code_intelligence/plugins/examples/code_quality.py +73 -0
  113. semantic_code_intelligence/plugins/examples/search_annotator.py +56 -0
  114. semantic_code_intelligence/scalability/__init__.py +205 -0
  115. semantic_code_intelligence/search/__init__.py +0 -0
  116. semantic_code_intelligence/search/formatter.py +123 -0
  117. semantic_code_intelligence/search/grep.py +361 -0
  118. semantic_code_intelligence/search/hybrid_search.py +170 -0
  119. semantic_code_intelligence/search/keyword_search.py +311 -0
  120. semantic_code_intelligence/search/section_expander.py +103 -0
  121. semantic_code_intelligence/services/__init__.py +0 -0
  122. semantic_code_intelligence/services/indexing_service.py +630 -0
  123. semantic_code_intelligence/services/search_service.py +269 -0
  124. semantic_code_intelligence/storage/__init__.py +0 -0
  125. semantic_code_intelligence/storage/chunk_hash_store.py +86 -0
  126. semantic_code_intelligence/storage/hash_store.py +66 -0
  127. semantic_code_intelligence/storage/index_manifest.py +85 -0
  128. semantic_code_intelligence/storage/index_stats.py +138 -0
  129. semantic_code_intelligence/storage/query_history.py +160 -0
  130. semantic_code_intelligence/storage/symbol_registry.py +209 -0
  131. semantic_code_intelligence/storage/vector_store.py +297 -0
  132. semantic_code_intelligence/tests/__init__.py +0 -0
  133. semantic_code_intelligence/tests/test_ai_features.py +351 -0
  134. semantic_code_intelligence/tests/test_chunker.py +119 -0
  135. semantic_code_intelligence/tests/test_cli.py +188 -0
  136. semantic_code_intelligence/tests/test_config.py +154 -0
  137. semantic_code_intelligence/tests/test_context.py +381 -0
  138. semantic_code_intelligence/tests/test_embeddings.py +73 -0
  139. semantic_code_intelligence/tests/test_endtoend.py +1142 -0
  140. semantic_code_intelligence/tests/test_enhanced_embeddings.py +92 -0
  141. semantic_code_intelligence/tests/test_hash_store.py +79 -0
  142. semantic_code_intelligence/tests/test_logging.py +55 -0
  143. semantic_code_intelligence/tests/test_new_cli.py +138 -0
  144. semantic_code_intelligence/tests/test_parser.py +495 -0
  145. semantic_code_intelligence/tests/test_phase10.py +355 -0
  146. semantic_code_intelligence/tests/test_phase11.py +593 -0
  147. semantic_code_intelligence/tests/test_phase12.py +375 -0
  148. semantic_code_intelligence/tests/test_phase13.py +663 -0
  149. semantic_code_intelligence/tests/test_phase14.py +568 -0
  150. semantic_code_intelligence/tests/test_phase15.py +814 -0
  151. semantic_code_intelligence/tests/test_phase16.py +792 -0
  152. semantic_code_intelligence/tests/test_phase17.py +815 -0
  153. semantic_code_intelligence/tests/test_phase18.py +934 -0
  154. semantic_code_intelligence/tests/test_phase19.py +986 -0
  155. semantic_code_intelligence/tests/test_phase20.py +2753 -0
  156. semantic_code_intelligence/tests/test_phase20b.py +2058 -0
  157. semantic_code_intelligence/tests/test_phase20c.py +962 -0
  158. semantic_code_intelligence/tests/test_phase21.py +428 -0
  159. semantic_code_intelligence/tests/test_phase22.py +799 -0
  160. semantic_code_intelligence/tests/test_phase23.py +783 -0
  161. semantic_code_intelligence/tests/test_phase24.py +715 -0
  162. semantic_code_intelligence/tests/test_phase25.py +496 -0
  163. semantic_code_intelligence/tests/test_phase26.py +251 -0
  164. semantic_code_intelligence/tests/test_phase27.py +531 -0
  165. semantic_code_intelligence/tests/test_phase8.py +592 -0
  166. semantic_code_intelligence/tests/test_phase9.py +643 -0
  167. semantic_code_intelligence/tests/test_plugins.py +293 -0
  168. semantic_code_intelligence/tests/test_priority_features.py +727 -0
  169. semantic_code_intelligence/tests/test_router.py +41 -0
  170. semantic_code_intelligence/tests/test_scalability.py +138 -0
  171. semantic_code_intelligence/tests/test_scanner.py +125 -0
  172. semantic_code_intelligence/tests/test_search.py +160 -0
  173. semantic_code_intelligence/tests/test_semantic_chunker.py +255 -0
  174. semantic_code_intelligence/tests/test_tools.py +182 -0
  175. semantic_code_intelligence/tests/test_vector_store.py +151 -0
  176. semantic_code_intelligence/tests/test_watcher.py +211 -0
  177. semantic_code_intelligence/tools/__init__.py +442 -0
  178. semantic_code_intelligence/tools/executor.py +232 -0
  179. semantic_code_intelligence/tools/protocol.py +200 -0
  180. semantic_code_intelligence/tui/__init__.py +454 -0
  181. semantic_code_intelligence/utils/__init__.py +0 -0
  182. semantic_code_intelligence/utils/logging.py +112 -0
  183. semantic_code_intelligence/version.py +3 -0
  184. semantic_code_intelligence/web/__init__.py +11 -0
  185. semantic_code_intelligence/web/api.py +289 -0
  186. semantic_code_intelligence/web/server.py +397 -0
  187. semantic_code_intelligence/web/ui.py +659 -0
  188. semantic_code_intelligence/web/visualize.py +226 -0
  189. semantic_code_intelligence/workspace/__init__.py +427 -0
@@ -0,0 +1,311 @@
1
+ """Keyword and regex search engine — BM25 scoring + regex matching.
2
+
3
+ Provides grep-compatible text search and BM25-ranked keyword search
4
+ over indexed code chunks, without requiring external dependencies.
5
+ Supports persistent BM25 index serialization for fast startup.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import math
12
+ import re
13
+ from dataclasses import dataclass
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+ from semantic_code_intelligence.storage.vector_store import ChunkMetadata, VectorStore
18
+ from semantic_code_intelligence.utils.logging import get_logger
19
+
20
+ logger = get_logger("search.keyword")
21
+
22
+
23
@dataclass
class KeywordResult:
    """A single keyword/regex search result.

    The location and content fields are copied from the chunk metadata the
    hit came from.  ``score`` is the BM25 score for ``keyword_search`` and
    the number of matching lines for ``regex_search``.
    """

    file_path: str
    start_line: int
    end_line: int
    language: str
    content: str
    score: float
    chunk_index: int
    # Number of matching lines; keyword_search always stores 0 here.
    match_count: int
    # Line numbers of the matching lines; keyword_search always stores [].
    matched_lines: list[int]

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a plain dictionary.

        ``score`` is rounded to four decimal places.  ``matched_lines`` is
        copied so that mutating the returned dictionary cannot alter this
        result object.
        """
        return {
            "file_path": self.file_path,
            "start_line": self.start_line,
            "end_line": self.end_line,
            "language": self.language,
            "content": self.content,
            "score": round(self.score, 4),
            "chunk_index": self.chunk_index,
            "match_count": self.match_count,
            # Copy: callers must not be able to mutate our list via the dict.
            "matched_lines": list(self.matched_lines),
        }
49
+
50
+
51
+ # ---------------------------------------------------------------------------
52
+ # BM25 Scorer
53
+ # ---------------------------------------------------------------------------
54
+
55
+
56
+ def _tokenize(text: str) -> list[str]:
57
+ """Split text into tokens by camelCase boundaries, underscores, and whitespace."""
58
+ # First split camelCase: "getValue" -> ["get", "Value"]
59
+ # Then split on underscores, digits separated
60
+ parts = re.findall(r"[a-z]+|[A-Z][a-z]*|[0-9]+", text)
61
+ return parts
62
+
63
+
64
def _lower_tokens(text: str) -> list[str]:
    """Tokenize ``text`` with ``_tokenize`` and lowercase every token."""
    lowered: list[str] = []
    for token in _tokenize(text):
        lowered.append(token.lower())
    return lowered
66
+
67
+
68
class BM25Index:
    """A lightweight in-memory BM25 index over chunk metadata.

    Built from a VectorStore's metadata list so the same stored chunks can
    serve both semantic and keyword search.  Documents are identified by
    their position in that list.

    Attributes:
        metadata: The chunk metadata list the index was built over.
        n: Number of indexed documents.
        doc_tokens: Lowercased tokens per document.  Left empty when the
            index is restored by ``load`` because ``search`` does not use it.
        doc_lengths: Token count per document.
        avgdl: Mean document length in tokens (1.0 for an empty index).
        inverted: Maps each term to ``{doc_index: term_frequency}``.
    """

    # BM25 free parameters: k1 scales term-frequency saturation and b scales
    # document-length normalisation (see the denominator in ``search``).
    k1: float = 1.5
    b: float = 0.75

    def __init__(self, metadata: list[ChunkMetadata]) -> None:
        """Build the index over ``metadata`` immediately."""
        self.metadata = metadata
        self.n = len(metadata)
        self.doc_tokens: list[list[str]] = []
        self.doc_lengths: list[int] = []
        self.avgdl: float = 0.0
        # term -> {doc_idx: term_freq}
        self.inverted: dict[str, dict[int, int]] = {}
        self._build()

    def _build(self) -> None:
        """Tokenize every chunk and fill the postings and length statistics."""
        total_len = 0
        for idx, meta in enumerate(self.metadata):
            tokens = _lower_tokens(meta.content)
            self.doc_tokens.append(tokens)
            self.doc_lengths.append(len(tokens))
            total_len += len(tokens)

            # Per-document term frequencies.
            term_freqs: dict[str, int] = {}
            for tok in tokens:
                term_freqs[tok] = term_freqs.get(tok, 0) + 1
            for tok, freq in term_freqs.items():
                self.inverted.setdefault(tok, {})[idx] = freq
        self.avgdl = total_len / self.n if self.n else 1.0

    def search(self, query: str, top_k: int = 10) -> list[tuple[int, float]]:
        """Return ``(doc_index, bm25_score)`` pairs sorted by descending score.

        Args:
            query: Free text; tokenized the same way as the indexed content.
            top_k: Maximum number of pairs to return.  Zero or a negative
                value yields an empty list.

        Returns:
            At most ``top_k`` pairs.  Only documents containing at least one
            query term are scored; duplicate query terms count once.
        """
        if top_k <= 0:
            return []
        query_tokens = _lower_tokens(query)
        if not query_tokens:
            return []

        # A persisted index could carry avgdl == 0; avoid dividing by zero.
        avgdl = self.avgdl or 1.0

        scores: dict[int, float] = {}
        for token in set(query_tokens):
            postings = self.inverted.get(token)
            if not postings:
                continue
            df = len(postings)
            # The "+ 1.0" inside the log keeps idf non-negative even for
            # terms that occur in most documents.
            idf = math.log((self.n - df + 0.5) / (df + 0.5) + 1.0)
            for doc_idx, tf in postings.items():
                dl = self.doc_lengths[doc_idx]
                numerator = tf * (self.k1 + 1)
                denominator = tf + self.k1 * (1 - self.b + self.b * dl / avgdl)
                scores[doc_idx] = scores.get(doc_idx, 0.0) + idf * numerator / denominator

        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return ranked[:top_k]

    def save(self, directory: Path) -> None:
        """Persist BM25 index to disk for fast reload.

        Writes the inverted index, document lengths and summary statistics
        to ``bm25_index.json`` inside ``directory``, creating the directory
        first if it does not exist.

        Raises:
            OSError: If the directory cannot be created or the file cannot
                be written.
        """
        directory = Path(directory)
        directory.mkdir(parents=True, exist_ok=True)
        bm25_path = directory / "bm25_index.json"
        data = {
            "n": self.n,
            "avgdl": self.avgdl,
            "doc_lengths": self.doc_lengths,
            # JSON object keys must be strings, so doc indices are stringified.
            "inverted": {
                term: {str(k): v for k, v in postings.items()}
                for term, postings in self.inverted.items()
            },
        }
        bm25_path.write_text(
            json.dumps(data, ensure_ascii=False),
            encoding="utf-8",
        )
        logger.debug("Saved BM25 index (%d docs, %d terms) to %s",
                     self.n, len(self.inverted), directory)

    @classmethod
    def load(cls, directory: Path, metadata: list[ChunkMetadata]) -> "BM25Index | None":
        """Load a persisted BM25 index if available and valid.

        Args:
            directory: Directory that may contain ``bm25_index.json``.
            metadata: Current chunk metadata; attached to the loaded index
                and used to detect a stale cache by document count.

        Returns:
            The restored index, or None when the file is missing, unreadable,
            malformed, or was written for a different number of documents.
            Staleness is judged by document count only.
        """
        bm25_path = Path(directory) / "bm25_index.json"
        if not bm25_path.exists():
            return None
        try:
            data = json.loads(bm25_path.read_text(encoding="utf-8"))
            if data["n"] != len(metadata):
                logger.debug("BM25 cache stale (%d vs %d docs), rebuilding.",
                             data["n"], len(metadata))
                return None
            doc_lengths = data["doc_lengths"]
            # search() indexes doc_lengths by doc id, so a length mismatch
            # would only surface later as an IndexError.
            if len(doc_lengths) != data["n"]:
                raise ValueError("doc_lengths does not match document count")
            avgdl = data["avgdl"]
            inverted = {
                term: {int(k): v for k, v in postings.items()}
                for term, postings in data["inverted"].items()
            }
        except (OSError, ValueError, KeyError, TypeError, AttributeError):
            # ValueError also covers json.JSONDecodeError, undecodable bytes
            # and non-numeric doc ids; AttributeError covers postings that
            # are not mappings; OSError covers an unreadable file.
            logger.debug("BM25 cache corrupt, rebuilding.")
            return None

        # Bypass __init__ so the index is not rebuilt from the chunk text.
        idx = cls.__new__(cls)
        idx.metadata = metadata
        idx.n = data["n"]
        idx.avgdl = avgdl
        idx.doc_lengths = doc_lengths
        idx.doc_tokens = []  # not needed for search
        idx.inverted = inverted
        logger.debug("Loaded BM25 index from disk (%d docs).", idx.n)
        return idx
181
+
182
+
183
+ # ---------------------------------------------------------------------------
184
+ # Public API
185
+ # ---------------------------------------------------------------------------
186
+
187
+ _bm25_cache: dict[str, BM25Index] = {}
188
+
189
+
190
def _get_bm25(index_dir: Path, store: VectorStore) -> BM25Index:
    """Get or build a BM25 index for the given vector store.

    Checks, in order: the in-memory cache, the on-disk cache, then builds a
    fresh index.  A freshly built index is written to ``index_dir`` so later
    loads are faster; if that write fails the index is still returned and
    cached in memory, because persistence is only an optimisation.

    The in-memory entry is reused only while its document count equals
    ``store.size``; no other staleness check is made here.

    Args:
        index_dir: Index directory; its string form is the cache key.
        store: Loaded vector store whose metadata is indexed.

    Returns:
        A BM25 index over ``store.metadata``.
    """
    cache_key = str(index_dir)
    cached = _bm25_cache.get(cache_key)
    if cached is not None and cached.n == store.size:
        return cached

    # Try loading from disk
    idx = BM25Index.load(index_dir, store.metadata)
    if idx is None:
        # Build fresh and persist
        logger.debug("Building BM25 index over %d chunks.", store.size)
        idx = BM25Index(store.metadata)
        try:
            idx.save(index_dir)
        except OSError as exc:
            # A read-only or missing index directory must not break search.
            logger.warning("Could not persist BM25 index to %s: %s", index_dir, exc)

    _bm25_cache[cache_key] = idx
    return idx
213
+
214
+
215
def keyword_search(
    query: str,
    store: VectorStore,
    index_dir: Path,
    top_k: int = 10,
    threshold: float = 0.0,
) -> list[KeywordResult]:
    """Rank indexed chunks against ``query`` with BM25.

    Args:
        query: The search query (natural language or keywords).
        store: Loaded VectorStore with metadata.
        index_dir: Path to index directory (used to cache the BM25 index).
        top_k: Maximum number of hits requested from the BM25 index.
        threshold: Hits scoring below this value are discarded.

    Returns:
        KeywordResult objects in descending score order.  The threshold is
        applied after the top-k cut, so fewer than ``top_k`` results may be
        returned.  ``match_count`` is 0 and ``matched_lines`` is empty on
        every result.
    """
    if store.size == 0:
        return []

    ranked = _get_bm25(index_dir, store).search(query, top_k=top_k)

    # Pair each surviving hit with the chunk it refers to.
    surviving = (
        (store.metadata[doc_idx], bm25_score)
        for doc_idx, bm25_score in ranked
        if not (bm25_score < threshold)
    )
    return [
        KeywordResult(
            file_path=chunk.file_path,
            start_line=chunk.start_line,
            end_line=chunk.end_line,
            language=chunk.language,
            content=chunk.content,
            score=bm25_score,
            chunk_index=chunk.chunk_index,
            match_count=0,
            matched_lines=[],
        )
        for chunk, bm25_score in surviving
    ]
259
+
260
+
261
def regex_search(
    pattern: str,
    store: VectorStore,
    top_k: int = 10,
    case_insensitive: bool = True,
) -> list[KeywordResult]:
    """Scan every indexed chunk line by line for a regular expression.

    Args:
        pattern: Regex pattern string.
        store: Loaded VectorStore with metadata.
        top_k: Maximum number of results to return.
        case_insensitive: Compile the pattern with ``re.IGNORECASE``.

    Returns:
        Up to ``top_k`` KeywordResult objects, highest score first.  A
        chunk's score and ``match_count`` are the number of its lines that
        match; ``matched_lines`` holds those lines' numbers, computed as the
        chunk's ``start_line`` plus the line's offset within the chunk.  An
        invalid pattern is logged as a warning and yields an empty list.
    """
    if store.size == 0:
        return []

    try:
        matcher = re.compile(pattern, re.IGNORECASE if case_insensitive else 0)
    except re.error as exc:
        logger.warning("Invalid regex pattern %r: %s", pattern, exc)
        return []

    hits: list[KeywordResult] = []
    for chunk in store.metadata:
        line_numbers = [
            chunk.start_line + offset
            for offset, text in enumerate(chunk.content.splitlines())
            if matcher.search(text)
        ]
        if not line_numbers:
            continue
        hit_total = len(line_numbers)
        hits.append(
            KeywordResult(
                file_path=chunk.file_path,
                start_line=chunk.start_line,
                end_line=chunk.end_line,
                language=chunk.language,
                content=chunk.content,
                score=float(hit_total),
                chunk_index=chunk.chunk_index,
                match_count=hit_total,
                matched_lines=line_numbers,
            )
        )

    # sorted() is stable, so equally scored chunks keep store order.
    return sorted(hits, key=lambda hit: hit.score, reverse=True)[:top_k]
@@ -0,0 +1,103 @@
1
+ """Full-section extraction — expands search results to complete functions/classes.
2
+
3
+ When a search hit lands inside a function or class, this module looks up
4
+ the symbol registry to return the *entire* enclosing symbol body, not
5
+ just the matching chunk.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ from semantic_code_intelligence.services.search_service import SearchResult
14
+ from semantic_code_intelligence.storage.symbol_registry import SymbolRegistry
15
+ from semantic_code_intelligence.utils.logging import get_logger
16
+
17
+ logger = get_logger("search.section")
18
+
19
+
20
+ def _read_lines(file_path: str, start: int, end: int) -> str:
21
+ """Read lines [start, end] (1-indexed) from a file."""
22
+ try:
23
+ all_lines = Path(file_path).read_text(encoding="utf-8", errors="replace").splitlines()
24
+ except (OSError, PermissionError):
25
+ return ""
26
+ s = max(0, start - 1)
27
+ e = min(len(all_lines), end)
28
+ return "\n".join(all_lines[s:e])
29
+
30
+
31
def expand_to_full_section(
    results: list[SearchResult],
    project_root: Path,
    index_dir: Path,
) -> list[SearchResult]:
    """Expand each search result to the full enclosing function/class.

    For every result the symbol registry is searched for the tightest
    symbol in the same file whose line range contains the result's range.
    When one is found and its source text can be read, a new result is
    produced covering the whole symbol.  Otherwise the original result is
    kept unchanged, so a result's line range and content always agree.

    Results that resolve to the same ``file_path:start:end`` are emitted
    once (first occurrence wins), so several chunks of one function
    collapse into a single expanded result.

    Args:
        results: Search results to expand.
        project_root: Root of the project.  Used to derive the registry
            lookup path and to locate files given by a relative path.
        index_dir: Index directory (for symbol registry).

    Returns:
        New list of SearchResult.  If the symbol registry cannot be loaded,
        ``results`` itself is returned unchanged.
    """
    try:
        registry = SymbolRegistry.load(index_dir)
    except Exception:
        # Deliberately broad: expansion is best-effort and must never make
        # a search fail.
        logger.debug("Symbol registry not found; returning results unchanged.")
        return results

    root = Path(project_root)
    expanded: list[SearchResult] = []
    seen_keys: set[str] = set()

    for r in results:
        result_path = Path(r.file_path)

        # Normalise to relative path for registry lookup
        try:
            rel = str(result_path.relative_to(root))
        except ValueError:
            rel = r.file_path

        # Find the tightest enclosing symbol
        best = None
        best_span = float("inf")
        for sym in registry.find_by_file(rel):
            if sym.start_line <= r.start_line and sym.end_line >= r.end_line:
                span = sym.end_line - sym.start_line
                if span < best_span:
                    best = sym
                    best_span = span

        candidate = r
        if best is not None:
            symbol_key = f"{r.file_path}:{best.start_line}:{best.end_line}"
            if symbol_key in seen_keys:
                # Another chunk already produced this symbol; skip the read.
                continue

            # A relative path is looked up under the project root first;
            # if nothing is there, fall back to the path exactly as given.
            read_path = result_path
            if not result_path.is_absolute():
                rooted = root / result_path
                if rooted.is_file():
                    read_path = rooted

            content = _read_lines(str(read_path), best.start_line, best.end_line)
            if content:
                candidate = SearchResult(
                    file_path=r.file_path,
                    start_line=best.start_line,
                    end_line=best.end_line,
                    language=r.language,
                    content=content,
                    score=r.score,
                    chunk_index=r.chunk_index,
                )
            # An empty read (missing file, or symbol lines past the end of
            # the file) keeps the original result: pairing the symbol's line
            # range with the chunk's content would mislabel the text.

        dedup_key = f"{candidate.file_path}:{candidate.start_line}:{candidate.end_line}"
        if dedup_key in seen_keys:
            continue
        seen_keys.add(dedup_key)
        expanded.append(candidate)

    return expanded
File without changes