codexa 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189) hide show
  1. codexa-0.4.0.dist-info/METADATA +650 -0
  2. codexa-0.4.0.dist-info/RECORD +189 -0
  3. codexa-0.4.0.dist-info/WHEEL +5 -0
  4. codexa-0.4.0.dist-info/entry_points.txt +2 -0
  5. codexa-0.4.0.dist-info/licenses/LICENSE +21 -0
  6. codexa-0.4.0.dist-info/top_level.txt +1 -0
  7. semantic_code_intelligence/__init__.py +5 -0
  8. semantic_code_intelligence/analysis/__init__.py +21 -0
  9. semantic_code_intelligence/analysis/ai_features.py +351 -0
  10. semantic_code_intelligence/bridge/__init__.py +28 -0
  11. semantic_code_intelligence/bridge/context_provider.py +245 -0
  12. semantic_code_intelligence/bridge/protocol.py +167 -0
  13. semantic_code_intelligence/bridge/server.py +348 -0
  14. semantic_code_intelligence/bridge/vscode.py +271 -0
  15. semantic_code_intelligence/ci/__init__.py +13 -0
  16. semantic_code_intelligence/ci/hooks.py +98 -0
  17. semantic_code_intelligence/ci/hotspots.py +272 -0
  18. semantic_code_intelligence/ci/impact.py +246 -0
  19. semantic_code_intelligence/ci/metrics.py +591 -0
  20. semantic_code_intelligence/ci/pr.py +412 -0
  21. semantic_code_intelligence/ci/quality.py +557 -0
  22. semantic_code_intelligence/ci/templates.py +164 -0
  23. semantic_code_intelligence/ci/trace.py +224 -0
  24. semantic_code_intelligence/cli/__init__.py +0 -0
  25. semantic_code_intelligence/cli/commands/__init__.py +0 -0
  26. semantic_code_intelligence/cli/commands/ask_cmd.py +153 -0
  27. semantic_code_intelligence/cli/commands/benchmark_cmd.py +303 -0
  28. semantic_code_intelligence/cli/commands/chat_cmd.py +252 -0
  29. semantic_code_intelligence/cli/commands/ci_gen_cmd.py +74 -0
  30. semantic_code_intelligence/cli/commands/context_cmd.py +120 -0
  31. semantic_code_intelligence/cli/commands/cross_refactor_cmd.py +113 -0
  32. semantic_code_intelligence/cli/commands/deps_cmd.py +91 -0
  33. semantic_code_intelligence/cli/commands/docs_cmd.py +101 -0
  34. semantic_code_intelligence/cli/commands/doctor_cmd.py +147 -0
  35. semantic_code_intelligence/cli/commands/evolve_cmd.py +171 -0
  36. semantic_code_intelligence/cli/commands/explain_cmd.py +112 -0
  37. semantic_code_intelligence/cli/commands/gate_cmd.py +135 -0
  38. semantic_code_intelligence/cli/commands/grep_cmd.py +234 -0
  39. semantic_code_intelligence/cli/commands/hotspots_cmd.py +119 -0
  40. semantic_code_intelligence/cli/commands/impact_cmd.py +131 -0
  41. semantic_code_intelligence/cli/commands/index_cmd.py +138 -0
  42. semantic_code_intelligence/cli/commands/init_cmd.py +152 -0
  43. semantic_code_intelligence/cli/commands/investigate_cmd.py +163 -0
  44. semantic_code_intelligence/cli/commands/languages_cmd.py +101 -0
  45. semantic_code_intelligence/cli/commands/lsp_cmd.py +49 -0
  46. semantic_code_intelligence/cli/commands/mcp_cmd.py +50 -0
  47. semantic_code_intelligence/cli/commands/metrics_cmd.py +264 -0
  48. semantic_code_intelligence/cli/commands/models_cmd.py +157 -0
  49. semantic_code_intelligence/cli/commands/plugin_cmd.py +275 -0
  50. semantic_code_intelligence/cli/commands/pr_summary_cmd.py +178 -0
  51. semantic_code_intelligence/cli/commands/quality_cmd.py +208 -0
  52. semantic_code_intelligence/cli/commands/refactor_cmd.py +103 -0
  53. semantic_code_intelligence/cli/commands/review_cmd.py +88 -0
  54. semantic_code_intelligence/cli/commands/search_cmd.py +236 -0
  55. semantic_code_intelligence/cli/commands/serve_cmd.py +117 -0
  56. semantic_code_intelligence/cli/commands/suggest_cmd.py +100 -0
  57. semantic_code_intelligence/cli/commands/summary_cmd.py +78 -0
  58. semantic_code_intelligence/cli/commands/tool_cmd.py +282 -0
  59. semantic_code_intelligence/cli/commands/trace_cmd.py +123 -0
  60. semantic_code_intelligence/cli/commands/tui_cmd.py +58 -0
  61. semantic_code_intelligence/cli/commands/viz_cmd.py +127 -0
  62. semantic_code_intelligence/cli/commands/watch_cmd.py +72 -0
  63. semantic_code_intelligence/cli/commands/web_cmd.py +61 -0
  64. semantic_code_intelligence/cli/commands/workspace_cmd.py +250 -0
  65. semantic_code_intelligence/cli/main.py +65 -0
  66. semantic_code_intelligence/cli/router.py +92 -0
  67. semantic_code_intelligence/config/__init__.py +0 -0
  68. semantic_code_intelligence/config/settings.py +260 -0
  69. semantic_code_intelligence/context/__init__.py +19 -0
  70. semantic_code_intelligence/context/engine.py +429 -0
  71. semantic_code_intelligence/context/memory.py +253 -0
  72. semantic_code_intelligence/daemon/__init__.py +1 -0
  73. semantic_code_intelligence/daemon/watcher.py +515 -0
  74. semantic_code_intelligence/docs/__init__.py +1080 -0
  75. semantic_code_intelligence/embeddings/__init__.py +0 -0
  76. semantic_code_intelligence/embeddings/enhanced.py +131 -0
  77. semantic_code_intelligence/embeddings/generator.py +149 -0
  78. semantic_code_intelligence/embeddings/model_registry.py +100 -0
  79. semantic_code_intelligence/evolution/__init__.py +1 -0
  80. semantic_code_intelligence/evolution/budget_guard.py +111 -0
  81. semantic_code_intelligence/evolution/commit_manager.py +88 -0
  82. semantic_code_intelligence/evolution/context_builder.py +131 -0
  83. semantic_code_intelligence/evolution/engine.py +249 -0
  84. semantic_code_intelligence/evolution/patch_generator.py +229 -0
  85. semantic_code_intelligence/evolution/task_selector.py +214 -0
  86. semantic_code_intelligence/evolution/test_runner.py +111 -0
  87. semantic_code_intelligence/indexing/__init__.py +0 -0
  88. semantic_code_intelligence/indexing/chunker.py +174 -0
  89. semantic_code_intelligence/indexing/parallel.py +86 -0
  90. semantic_code_intelligence/indexing/scanner.py +146 -0
  91. semantic_code_intelligence/indexing/semantic_chunker.py +337 -0
  92. semantic_code_intelligence/llm/__init__.py +62 -0
  93. semantic_code_intelligence/llm/cache.py +219 -0
  94. semantic_code_intelligence/llm/cached_provider.py +145 -0
  95. semantic_code_intelligence/llm/conversation.py +190 -0
  96. semantic_code_intelligence/llm/cross_refactor.py +272 -0
  97. semantic_code_intelligence/llm/investigation.py +274 -0
  98. semantic_code_intelligence/llm/mock_provider.py +77 -0
  99. semantic_code_intelligence/llm/ollama_provider.py +122 -0
  100. semantic_code_intelligence/llm/openai_provider.py +100 -0
  101. semantic_code_intelligence/llm/provider.py +92 -0
  102. semantic_code_intelligence/llm/rate_limiter.py +164 -0
  103. semantic_code_intelligence/llm/reasoning.py +438 -0
  104. semantic_code_intelligence/llm/safety.py +110 -0
  105. semantic_code_intelligence/llm/streaming.py +251 -0
  106. semantic_code_intelligence/lsp/__init__.py +609 -0
  107. semantic_code_intelligence/mcp/__init__.py +393 -0
  108. semantic_code_intelligence/parsing/__init__.py +19 -0
  109. semantic_code_intelligence/parsing/parser.py +375 -0
  110. semantic_code_intelligence/plugins/__init__.py +255 -0
  111. semantic_code_intelligence/plugins/examples/__init__.py +1 -0
  112. semantic_code_intelligence/plugins/examples/code_quality.py +73 -0
  113. semantic_code_intelligence/plugins/examples/search_annotator.py +56 -0
  114. semantic_code_intelligence/scalability/__init__.py +205 -0
  115. semantic_code_intelligence/search/__init__.py +0 -0
  116. semantic_code_intelligence/search/formatter.py +123 -0
  117. semantic_code_intelligence/search/grep.py +361 -0
  118. semantic_code_intelligence/search/hybrid_search.py +170 -0
  119. semantic_code_intelligence/search/keyword_search.py +311 -0
  120. semantic_code_intelligence/search/section_expander.py +103 -0
  121. semantic_code_intelligence/services/__init__.py +0 -0
  122. semantic_code_intelligence/services/indexing_service.py +630 -0
  123. semantic_code_intelligence/services/search_service.py +269 -0
  124. semantic_code_intelligence/storage/__init__.py +0 -0
  125. semantic_code_intelligence/storage/chunk_hash_store.py +86 -0
  126. semantic_code_intelligence/storage/hash_store.py +66 -0
  127. semantic_code_intelligence/storage/index_manifest.py +85 -0
  128. semantic_code_intelligence/storage/index_stats.py +138 -0
  129. semantic_code_intelligence/storage/query_history.py +160 -0
  130. semantic_code_intelligence/storage/symbol_registry.py +209 -0
  131. semantic_code_intelligence/storage/vector_store.py +297 -0
  132. semantic_code_intelligence/tests/__init__.py +0 -0
  133. semantic_code_intelligence/tests/test_ai_features.py +351 -0
  134. semantic_code_intelligence/tests/test_chunker.py +119 -0
  135. semantic_code_intelligence/tests/test_cli.py +188 -0
  136. semantic_code_intelligence/tests/test_config.py +154 -0
  137. semantic_code_intelligence/tests/test_context.py +381 -0
  138. semantic_code_intelligence/tests/test_embeddings.py +73 -0
  139. semantic_code_intelligence/tests/test_endtoend.py +1142 -0
  140. semantic_code_intelligence/tests/test_enhanced_embeddings.py +92 -0
  141. semantic_code_intelligence/tests/test_hash_store.py +79 -0
  142. semantic_code_intelligence/tests/test_logging.py +55 -0
  143. semantic_code_intelligence/tests/test_new_cli.py +138 -0
  144. semantic_code_intelligence/tests/test_parser.py +495 -0
  145. semantic_code_intelligence/tests/test_phase10.py +355 -0
  146. semantic_code_intelligence/tests/test_phase11.py +593 -0
  147. semantic_code_intelligence/tests/test_phase12.py +375 -0
  148. semantic_code_intelligence/tests/test_phase13.py +663 -0
  149. semantic_code_intelligence/tests/test_phase14.py +568 -0
  150. semantic_code_intelligence/tests/test_phase15.py +814 -0
  151. semantic_code_intelligence/tests/test_phase16.py +792 -0
  152. semantic_code_intelligence/tests/test_phase17.py +815 -0
  153. semantic_code_intelligence/tests/test_phase18.py +934 -0
  154. semantic_code_intelligence/tests/test_phase19.py +986 -0
  155. semantic_code_intelligence/tests/test_phase20.py +2753 -0
  156. semantic_code_intelligence/tests/test_phase20b.py +2058 -0
  157. semantic_code_intelligence/tests/test_phase20c.py +962 -0
  158. semantic_code_intelligence/tests/test_phase21.py +428 -0
  159. semantic_code_intelligence/tests/test_phase22.py +799 -0
  160. semantic_code_intelligence/tests/test_phase23.py +783 -0
  161. semantic_code_intelligence/tests/test_phase24.py +715 -0
  162. semantic_code_intelligence/tests/test_phase25.py +496 -0
  163. semantic_code_intelligence/tests/test_phase26.py +251 -0
  164. semantic_code_intelligence/tests/test_phase27.py +531 -0
  165. semantic_code_intelligence/tests/test_phase8.py +592 -0
  166. semantic_code_intelligence/tests/test_phase9.py +643 -0
  167. semantic_code_intelligence/tests/test_plugins.py +293 -0
  168. semantic_code_intelligence/tests/test_priority_features.py +727 -0
  169. semantic_code_intelligence/tests/test_router.py +41 -0
  170. semantic_code_intelligence/tests/test_scalability.py +138 -0
  171. semantic_code_intelligence/tests/test_scanner.py +125 -0
  172. semantic_code_intelligence/tests/test_search.py +160 -0
  173. semantic_code_intelligence/tests/test_semantic_chunker.py +255 -0
  174. semantic_code_intelligence/tests/test_tools.py +182 -0
  175. semantic_code_intelligence/tests/test_vector_store.py +151 -0
  176. semantic_code_intelligence/tests/test_watcher.py +211 -0
  177. semantic_code_intelligence/tools/__init__.py +442 -0
  178. semantic_code_intelligence/tools/executor.py +232 -0
  179. semantic_code_intelligence/tools/protocol.py +200 -0
  180. semantic_code_intelligence/tui/__init__.py +454 -0
  181. semantic_code_intelligence/utils/__init__.py +0 -0
  182. semantic_code_intelligence/utils/logging.py +112 -0
  183. semantic_code_intelligence/version.py +3 -0
  184. semantic_code_intelligence/web/__init__.py +11 -0
  185. semantic_code_intelligence/web/api.py +289 -0
  186. semantic_code_intelligence/web/server.py +397 -0
  187. semantic_code_intelligence/web/ui.py +659 -0
  188. semantic_code_intelligence/web/visualize.py +226 -0
  189. semantic_code_intelligence/workspace/__init__.py +427 -0
@@ -0,0 +1,214 @@
1
+ """Task selector — chooses the next small improvement task.
2
+
3
+ Analyses the current repository state (git diff, failing tests, code
4
+ quality signals) and picks a single, well-scoped task for the LLM to
5
+ implement. Every task targets **≤3 files** and **≤200 lines changed**.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+ from pathlib import Path
12
+
13
+ from semantic_code_intelligence.evolution.commit_manager import CommitManager
14
+ from semantic_code_intelligence.evolution.test_runner import TestResult, TestRunner
15
+ from semantic_code_intelligence.utils.logging import get_logger
16
+
17
+ logger = get_logger("evolution.task_selector")
18
+
19
# Task category identifiers. TASK_PRIORITIES lists them from the most
# urgent category (first) to the fallback category (last).
TASK_FIX_TESTS = "fix_failing_tests"
TASK_TYPE_HINTS = "add_missing_type_hints"
TASK_ERROR_HANDLING = "improve_error_handling"
TASK_REDUCE_DUPLICATION = "reduce_duplication"
TASK_SMALL_OPTIMISATION = "small_performance_optimisation"

TASK_PRIORITIES: list[str] = [
    TASK_FIX_TESTS,           # 1. repair a red test suite
    TASK_TYPE_HINTS,          # 2. add missing return annotations
    TASK_ERROR_HANDLING,      # 3. tighten bare/broad except clauses
    TASK_REDUCE_DUPLICATION,  # 4. factor out repeated blocks
    TASK_SMALL_OPTIMISATION,  # 5. fallback: small quality/perf tweak
]
33
+
34
+
35
@dataclass
class EvolutionTask:
    """One narrowly scoped improvement for the evolution loop to attempt.

    Attributes:
        category: Task category identifier (``TaskSelector`` fills this
            with one of the ``TASK_*`` constants).
        description: Human-readable statement of what should be changed.
        target_files: Paths of the files the task concerns; may be empty.
        context_hint: Free-form extra context for whoever implements the task.
    """

    category: str
    description: str
    target_files: list[str] = field(default_factory=list)
    context_hint: str = ""

    def to_dict(self) -> dict[str, object]:
        """Return the task as a plain ``dict`` keyed by field name.

        The ``target_files`` list is returned by reference, not copied.
        """
        exported = ("category", "description", "target_files", "context_hint")
        return {name: getattr(self, name) for name in exported}
52
+
53
+
54
class TaskSelector:
    """Selects the next evolution task based on repo state.

    Candidates are considered in priority order: failing tests first, then
    missing return type hints, bare/broad ``except`` clauses, duplicated
    blocks, and finally a generic small-optimisation fallback.
    """

    def __init__(
        self,
        project_root: Path,
        test_runner: TestRunner,
        commit_manager: CommitManager,
    ) -> None:
        """Store the resolved project root and the collaborating services.

        Args:
            project_root: Repository root; resolved to an absolute path.
            test_runner: Test runner instance (stored for later use).
            commit_manager: Git helper instance (stored for later use).
        """
        self._root = project_root.resolve()
        self._runner = test_runner
        self._git = commit_manager

    def select(self, last_test_result: TestResult | None = None) -> EvolutionTask:
        """Choose the highest-priority actionable task.

        1. If tests are failing → fix them
        2. Else scan for missing type hints
        3. Else scan for bare excepts / weak error handling
        4. Else look for obvious duplication
        5. Fallback: small quality improvement

        Args:
            last_test_result: Outcome of the most recent test run, if any.

        Returns:
            The first task produced by the priority chain above.
        """
        # Priority 1: fix failing tests
        if last_test_result and not last_test_result.passed:
            return self._task_from_failures(last_test_result)

        # Priority 2–4: static analysis of source files, in priority order
        src_dir = self._root / "semantic_code_intelligence"
        py_files = self._collect_py_files(src_dir)
        finders = (
            self._find_type_hint_task,
            self._find_error_handling_task,
            self._find_duplication_task,
        )
        for finder in finders:
            task = finder(py_files)
            if task:
                return task

        # Priority 5: fallback
        return EvolutionTask(
            category=TASK_SMALL_OPTIMISATION,
            description="Look for a small quality or performance improvement in the codebase.",
            target_files=[],
            context_hint="Focus on hot-path functions or frequently used utilities.",
        )

    # ------------------------------------------------------------------ #
    # Task builders
    # ------------------------------------------------------------------ #

    def _task_from_failures(self, result: TestResult) -> EvolutionTask:
        """Extract a fix-tests task from failing test output.

        Scans the captured pytest output for ``FAILED``/``ERROR`` summary
        lines and for any line containing a ``path::test`` node id, and
        collects the distinct ``.py`` paths (at most three are kept).

        Args:
            result: The failing test run.

        Returns:
            A ``TASK_FIX_TESTS`` task whose context hint is the last 40
            non-empty lines of the test output.
        """
        failing_files: list[str] = []
        for raw_line in result.output.splitlines():
            line = raw_line.strip()
            for prefix in ("FAILED ", "ERROR "):
                if line.startswith(prefix):
                    # Drop the status word so it does not end up in the path.
                    line = line[len(prefix):]
                    break
            else:
                if "::" not in line:
                    continue
            # Keep only the file path: cut the "::test_name" node suffix and
            # any trailing " - message" that pytest appends to summary lines.
            candidate = line.split("::", 1)[0].split(" - ", 1)[0].strip()
            if candidate.endswith(".py") and candidate not in failing_files:
                failing_files.append(candidate)
        return EvolutionTask(
            category=TASK_FIX_TESTS,
            description=f"Fix {result.failures} failing test(s).",
            target_files=failing_files[:3],
            context_hint=_last_n_lines(result.output, 40),
        )

    def _find_type_hint_task(self, files: list[Path]) -> EvolutionTask | None:
        """Find a source file with functions lacking return type annotations.

        Detection is regex-based: a ``def`` whose parameter list is followed
        directly by ``:`` counts as untyped, one followed by ``->`` counts as
        typed. Signatures containing ``)`` inside the parameter list are not
        recognised by either pattern.

        Args:
            files: Candidate source files, scanned in order.

        Returns:
            A task for the first file with at least two untyped defs, or
            ``None`` when no file qualifies.
        """
        import re

        untyped_def = re.compile(
            r"^\s*(?:async\s+)?def\s+\w+\([^)]*\)\s*:", re.MULTILINE
        )
        typed_def = re.compile(
            r"^\s*(?:async\s+)?def\s+\w+\([^)]*\)\s*->\s*", re.MULTILINE
        )

        for fpath in files:
            try:
                text = fpath.read_text(encoding="utf-8", errors="replace")
            except OSError:
                continue
            # The two patterns are mutually exclusive, so the untyped matches
            # ARE the missing annotations; the total is the sum of both.
            missing = len(untyped_def.findall(text))
            typed_count = len(typed_def.findall(text))
            if missing >= 2:
                rel = str(fpath.relative_to(self._root))
                return EvolutionTask(
                    category=TASK_TYPE_HINTS,
                    description=f"Add return type hints to {missing} function(s) in {rel}.",
                    target_files=[rel],
                    context_hint=f"File has {missing + typed_count} defs, {typed_count} typed.",
                )
        return None

    def _find_error_handling_task(self, files: list[Path]) -> EvolutionTask | None:
        """Find a file with bare ``except:`` or ``except Exception:`` blocks.

        Args:
            files: Candidate source files, scanned in order.

        Returns:
            A task for the first file containing such a clause at any
            indentation level, or ``None`` when no file qualifies.
        """
        import re

        # Handlers are almost always indented, so allow leading whitespace.
        broad_except = re.compile(
            r"^[ \t]*except(?:[ \t]+Exception)?[ \t]*:", re.MULTILINE
        )

        for fpath in files:
            try:
                text = fpath.read_text(encoding="utf-8", errors="replace")
            except OSError:
                continue
            if broad_except.search(text):
                rel = str(fpath.relative_to(self._root))
                return EvolutionTask(
                    category=TASK_ERROR_HANDLING,
                    description=f"Replace bare/broad except blocks with specific exceptions in {rel}.",
                    target_files=[rel],
                    context_hint="Catch only the exceptions that can actually occur.",
                )
        return None

    def _find_duplication_task(self, files: list[Path]) -> EvolutionTask | None:
        """Very lightweight duplication detector — looks for repeated blocks.

        Only files of at least 300 lines are inspected. Every window of five
        consecutive lines (longer than 60 characters once stripped) is
        counted; a file qualifies when two or more distinct windows repeat.

        Args:
            files: Candidate source files, scanned in order.

        Returns:
            A task for the first qualifying file, or ``None``.
        """
        for fpath in files:
            try:
                lines = fpath.read_text(encoding="utf-8", errors="replace").splitlines()
            except OSError:
                continue
            if len(lines) < 300:
                continue
            blocks: dict[str, int] = {}
            for i in range(len(lines) - 4):
                block = "\n".join(lines[i : i + 5]).strip()
                if len(block) > 60:  # ignore near-empty windows
                    blocks[block] = blocks.get(block, 0) + 1
            dups = sum(1 for count in blocks.values() if count >= 2)
            if dups >= 2:
                rel = str(fpath.relative_to(self._root))
                return EvolutionTask(
                    category=TASK_REDUCE_DUPLICATION,
                    description=f"Extract duplicated logic into helper functions in {rel}.",
                    target_files=[rel],
                    context_hint=f"Found {dups} repeated 5-line blocks.",
                )
        return None

    # ------------------------------------------------------------------ #
    # Helpers
    # ------------------------------------------------------------------ #

    def _collect_py_files(self, src_dir: Path) -> list[Path]:
        """Collect .py source files, excluding tests and __pycache__.

        A file is excluded when any component of its path (relative to the
        project root) is exactly ``tests`` or ``__pycache__``; names that
        merely contain those words (e.g. ``contests``) are kept.

        Args:
            src_dir: Directory to search recursively.

        Returns:
            Sorted list of matching paths; empty if ``src_dir`` is missing.
        """
        results: list[Path] = []
        if not src_dir.exists():
            return results
        for fpath in sorted(src_dir.rglob("*.py")):
            parts = fpath.relative_to(self._root).parts
            if "tests" in parts or "__pycache__" in parts:
                continue
            results.append(fpath)
        return results
209
+
210
+
211
+ def _last_n_lines(text: str, n: int) -> str:
212
+ """Return the last *n* non-empty lines of *text*."""
213
+ lines = [l for l in text.splitlines() if l.strip()]
214
+ return "\n".join(lines[-n:])
@@ -0,0 +1,111 @@
1
+ """Test runner — executes pytest and returns structured results.
2
+
3
+ Runs ``pytest`` as a subprocess to avoid polluting the current process
4
+ with imported test modules. Returns a structured ``TestResult`` that
5
+ the engine can use to decide whether to commit or revert.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import subprocess
11
+ import sys
12
+ from dataclasses import dataclass, field
13
+ from pathlib import Path
14
+
15
+ from semantic_code_intelligence.utils.logging import get_logger
16
+
17
+ logger = get_logger("evolution.test_runner")
18
+
19
+
20
@dataclass
class TestResult:
    """Outcome of one pytest invocation.

    Attributes:
        passed: Whether the run is considered successful.
        total: Number of tests counted in the run.
        failures: Number of failed tests.
        errors: Number of errored tests.
        output: Captured text output of the run.
        return_code: Process exit code; defaults to ``-1``.
    """

    __test__ = False  # prevent pytest collection

    passed: bool = False
    total: int = 0
    failures: int = 0
    errors: int = 0
    output: str = ""
    return_code: int = -1

    def summary_line(self) -> str:
        """Return a one-line ``[PASS]``/``[FAIL]`` summary with the counts."""
        if self.passed:
            status = "PASS"
        else:
            status = "FAIL"
        return f"[{status}] {self.total} tests, {self.failures} failures, {self.errors} errors"
36
+
37
+
38
class TestRunner:
    """Executes the project's pytest suite in a child process."""

    __test__ = False  # prevent pytest collection

    def __init__(self, project_root: Path, timeout: int = 120) -> None:
        """Remember the resolved project root and the timeout in seconds."""
        self._root = project_root.resolve()
        self._timeout = timeout

    def run(self, extra_args: list[str] | None = None) -> TestResult:
        """Invoke pytest on the package test directory and summarise the run.

        Parameters
        ----------
        extra_args
            Extra pytest command-line arguments appended after the defaults
            (e.g. ``["-x", "--tb=short"]``).

        Returns
        -------
        TestResult
            ``passed`` is true when pytest exits with code 0. On timeout a
            failed result with ``return_code=-1`` and a timeout message as
            output is returned instead.
        """
        tests_dir = self._root / "semantic_code_intelligence" / "tests"
        cmd = [sys.executable, "-m", "pytest", str(tests_dir), "-q", "--tb=line", "--no-header"]
        cmd.extend(extra_args or [])

        logger.info("Running: %s", " ".join(cmd))
        try:
            proc = subprocess.run(
                cmd,
                cwd=str(self._root),
                timeout=self._timeout,
                capture_output=True,
                text=True,
            )
        except subprocess.TimeoutExpired:
            return TestResult(
                passed=False,
                output=f"pytest timed out after {self._timeout}s",
                return_code=-1,
            )

        combined_output = proc.stdout + proc.stderr
        # Counts come from the summary line, e.g. "2258 passed, 3 warnings in 20.05s"
        total, failures, errors = _parse_summary(combined_output)
        return TestResult(
            passed=proc.returncode == 0,
            total=total,
            failures=failures,
            errors=errors,
            output=combined_output,
            return_code=proc.returncode,
        )
88
+
89
+
90
+ def _parse_summary(output: str) -> tuple[int, int, int]:
91
+ """Extract passed/failed/error counts from pytest output."""
92
+ total = 0
93
+ failures = 0
94
+ errors = 0
95
+ for line in reversed(output.splitlines()):
96
+ line_lower = line.strip().lower()
97
+ if "passed" in line_lower or "failed" in line_lower or "error" in line_lower:
98
+ import re
99
+ m_passed = re.search(r"(\d+)\s+passed", line_lower)
100
+ m_failed = re.search(r"(\d+)\s+failed", line_lower)
101
+ m_error = re.search(r"(\d+)\s+error", line_lower)
102
+ if m_passed:
103
+ total += int(m_passed.group(1))
104
+ if m_failed:
105
+ failures = int(m_failed.group(1))
106
+ total += failures
107
+ if m_error:
108
+ errors = int(m_error.group(1))
109
+ total += errors
110
+ break
111
+ return total, failures, errors
File without changes
@@ -0,0 +1,174 @@
1
+ """Code chunker — splits source files into meaningful chunks for embedding."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+
8
+
9
@dataclass
class CodeChunk:
    """A contiguous run of lines taken from one source file.

    Attributes:
        file_path: Path of the file the chunk came from.
        content: The chunk's text.
        start_line: First line of the chunk (1-indexed).
        end_line: Last line of the chunk (1-indexed, inclusive).
        chunk_index: Position of the chunk within its file, starting at 0.
        language: Language name for the file, or ``"unknown"``.
    """

    file_path: str
    content: str
    start_line: int
    end_line: int
    chunk_index: int
    language: str
19
+
20
+
21
# Lower-case file extension (including the dot) -> language name.
EXTENSION_TO_LANGUAGE: dict[str, str] = {
    ".py": "python",
    ".js": "javascript",
    ".ts": "typescript",
    ".jsx": "javascript",
    ".tsx": "typescript",
    ".java": "java",
    ".go": "go",
    ".rs": "rust",
    ".c": "c",
    ".cpp": "cpp",
    ".h": "c",
    ".hpp": "cpp",
    ".rb": "ruby",
    ".php": "php",
    ".cs": "csharp",
    ".swift": "swift",
    ".kt": "kotlin",
    ".scala": "scala",
    ".sh": "shell",
    ".bash": "shell",
    ".sql": "sql",
    ".r": "r",
    ".lua": "lua",
    ".dart": "dart",
    ".ex": "elixir",
    ".exs": "elixir",
}


def detect_language(file_path: str) -> str:
    """Map a file's extension to a language name.

    The lookup is case-insensitive with respect to the extension.

    Args:
        file_path: Path to the source file.

    Returns:
        The language name from ``EXTENSION_TO_LANGUAGE``, or ``'unknown'``
        when the extension is missing or not listed.
    """
    suffix = Path(file_path).suffix.lower()
    if suffix in EXTENSION_TO_LANGUAGE:
        return EXTENSION_TO_LANGUAGE[suffix]
    return "unknown"
63
+
64
+
65
def chunk_code(
    content: str,
    file_path: str,
    chunk_size: int = 512,
    chunk_overlap: int = 64,
) -> list[CodeChunk]:
    """Split source code into overlapping chunks by line boundaries.

    Lines are accumulated until the running character count reaches
    ``chunk_size``; the chunk is then emitted and its trailing lines
    (enough to total at least ``chunk_overlap`` characters) are carried
    into the next chunk. Because splitting happens only at line boundaries,
    a chunk may exceed ``chunk_size`` by up to one line.

    The carried overlap is always a strict suffix of the emitted chunk, so
    every chunk contributes at least one new line, and a chunk consisting
    only of carried-over lines is never emitted.

    Args:
        content: The full source code string.
        file_path: Path to the source file (for metadata).
        chunk_size: Character count at which a chunk is closed.
        chunk_overlap: Minimum characters of overlap between consecutive
            chunks (rounded up to whole lines).

    Returns:
        List of CodeChunk objects with 1-indexed, inclusive line ranges;
        empty when ``content`` is blank.
    """
    if not content.strip():
        return []

    language = detect_language(file_path)
    lines = content.splitlines(keepends=True)
    chunks: list[CodeChunk] = []

    window: list[str] = []  # lines of the chunk currently being built
    window_start = 0        # 0-indexed line number of window[0]
    window_chars = 0
    carried = 0             # leading lines of `window` inherited as overlap

    for i, line in enumerate(lines):
        window.append(line)
        window_chars += len(line)
        if window_chars < chunk_size:
            continue

        chunks.append(
            CodeChunk(
                file_path=file_path,
                content="".join(window),
                start_line=window_start + 1,  # 1-indexed
                end_line=i + 1,
                chunk_index=len(chunks),
                language=language,
            )
        )

        # Walk backwards until enough overlap characters are collected.
        overlap_chars = 0
        overlap_start = len(window)
        for j in range(len(window) - 1, -1, -1):
            overlap_chars += len(window[j])
            if overlap_chars >= chunk_overlap:
                overlap_start = j
                break
        # Never carry the whole chunk forward: that would make the next
        # chunk a superset of this one (e.g. after a single very long line).
        overlap_start = max(overlap_start, 1)

        window = window[overlap_start:]
        carried = len(window)
        window_start = (i + 1) - carried
        window_chars = sum(len(text) for text in window)

    # Emit the tail only if it holds lines not already in the previous chunk.
    if len(window) > carried:
        tail_text = "".join(window)
        if tail_text.strip():
            chunks.append(
                CodeChunk(
                    file_path=file_path,
                    content=tail_text,
                    start_line=window_start + 1,
                    end_line=len(lines),
                    chunk_index=len(chunks),
                    language=language,
                )
            )

    return chunks
147
+
148
+
149
def chunk_file(
    file_path: Path,
    chunk_size: int = 512,
    chunk_overlap: int = 64,
) -> list[CodeChunk]:
    """Load a file from disk and chunk its text.

    The file is decoded as UTF-8 with undecodable bytes replaced.

    Args:
        file_path: Path to the source file.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters of overlap.

    Returns:
        List of CodeChunk objects; empty if the file cannot be read.
    """
    try:
        source_text = file_path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        # Covers PermissionError too, which is an OSError subclass.
        return []

    return chunk_code(source_text, str(file_path), chunk_size, chunk_overlap)
@@ -0,0 +1,86 @@
1
+ """Parallel indexing utilities — concurrent file I/O and chunking.
2
+
3
+ Speeds up the scanning and chunking phases by processing files in
4
+ parallel using a thread pool, while embedding generation is batched
5
+ through the model (which already uses efficient GPU/CPU batching).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from concurrent.futures import ThreadPoolExecutor, as_completed
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ from semantic_code_intelligence.indexing.chunker import CodeChunk, chunk_file
15
+ from semantic_code_intelligence.indexing.scanner import ScannedFile, compute_file_hash
16
+ from semantic_code_intelligence.utils.logging import get_logger
17
+
18
+ logger = get_logger("indexing.parallel")
19
+
20
# Default thread count: kept small so disk and CPU are not overwhelmed.
DEFAULT_WORKERS = 4


def parallel_chunk_files(
    files: list[ScannedFile],
    chunk_size: int = 512,
    chunk_overlap: int = 64,
    max_workers: int = DEFAULT_WORKERS,
) -> list[tuple[ScannedFile, list[CodeChunk]]]:
    """Chunk many files concurrently on a thread pool.

    Args:
        files: List of scanned files to chunk.
        chunk_size: Max characters per chunk.
        chunk_overlap: Overlap between consecutive chunks.
        max_workers: Number of threads.

    Returns:
        One ``(ScannedFile, chunks)`` tuple per input file, in input order.
    """
    if not files:
        return []

    def _chunk_at(position: int) -> list[CodeChunk]:
        return chunk_file(
            files[position].path,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    # Futures finish in arbitrary order, so results are keyed by input position.
    chunks_by_position: dict[int, list[CodeChunk]] = {}
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {pool.submit(_chunk_at, pos): pos for pos in range(len(files))}
        for finished in as_completed(pending):
            chunks_by_position[pending[finished]] = finished.result()

    return [(files[pos], chunks_by_position[pos]) for pos in range(len(files))]
60
+
61
+
62
def parallel_scan_hashes(
    file_paths: list[Path],
    max_workers: int = DEFAULT_WORKERS,
) -> dict[Path, str]:
    """Hash many files concurrently on a thread pool.

    Args:
        file_paths: Files to hash.
        max_workers: Number of threads.

    Returns:
        Mapping of each path to the digest returned by
        ``compute_file_hash`` for it.
    """
    digests: dict[Path, str] = {}

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {pool.submit(compute_file_hash, path): path for path in file_paths}
        for finished in as_completed(pending):
            digests[pending[finished]] = finished.result()

    return digests