codexa 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189) hide show
  1. codexa-0.4.0.dist-info/METADATA +650 -0
  2. codexa-0.4.0.dist-info/RECORD +189 -0
  3. codexa-0.4.0.dist-info/WHEEL +5 -0
  4. codexa-0.4.0.dist-info/entry_points.txt +2 -0
  5. codexa-0.4.0.dist-info/licenses/LICENSE +21 -0
  6. codexa-0.4.0.dist-info/top_level.txt +1 -0
  7. semantic_code_intelligence/__init__.py +5 -0
  8. semantic_code_intelligence/analysis/__init__.py +21 -0
  9. semantic_code_intelligence/analysis/ai_features.py +351 -0
  10. semantic_code_intelligence/bridge/__init__.py +28 -0
  11. semantic_code_intelligence/bridge/context_provider.py +245 -0
  12. semantic_code_intelligence/bridge/protocol.py +167 -0
  13. semantic_code_intelligence/bridge/server.py +348 -0
  14. semantic_code_intelligence/bridge/vscode.py +271 -0
  15. semantic_code_intelligence/ci/__init__.py +13 -0
  16. semantic_code_intelligence/ci/hooks.py +98 -0
  17. semantic_code_intelligence/ci/hotspots.py +272 -0
  18. semantic_code_intelligence/ci/impact.py +246 -0
  19. semantic_code_intelligence/ci/metrics.py +591 -0
  20. semantic_code_intelligence/ci/pr.py +412 -0
  21. semantic_code_intelligence/ci/quality.py +557 -0
  22. semantic_code_intelligence/ci/templates.py +164 -0
  23. semantic_code_intelligence/ci/trace.py +224 -0
  24. semantic_code_intelligence/cli/__init__.py +0 -0
  25. semantic_code_intelligence/cli/commands/__init__.py +0 -0
  26. semantic_code_intelligence/cli/commands/ask_cmd.py +153 -0
  27. semantic_code_intelligence/cli/commands/benchmark_cmd.py +303 -0
  28. semantic_code_intelligence/cli/commands/chat_cmd.py +252 -0
  29. semantic_code_intelligence/cli/commands/ci_gen_cmd.py +74 -0
  30. semantic_code_intelligence/cli/commands/context_cmd.py +120 -0
  31. semantic_code_intelligence/cli/commands/cross_refactor_cmd.py +113 -0
  32. semantic_code_intelligence/cli/commands/deps_cmd.py +91 -0
  33. semantic_code_intelligence/cli/commands/docs_cmd.py +101 -0
  34. semantic_code_intelligence/cli/commands/doctor_cmd.py +147 -0
  35. semantic_code_intelligence/cli/commands/evolve_cmd.py +171 -0
  36. semantic_code_intelligence/cli/commands/explain_cmd.py +112 -0
  37. semantic_code_intelligence/cli/commands/gate_cmd.py +135 -0
  38. semantic_code_intelligence/cli/commands/grep_cmd.py +234 -0
  39. semantic_code_intelligence/cli/commands/hotspots_cmd.py +119 -0
  40. semantic_code_intelligence/cli/commands/impact_cmd.py +131 -0
  41. semantic_code_intelligence/cli/commands/index_cmd.py +138 -0
  42. semantic_code_intelligence/cli/commands/init_cmd.py +152 -0
  43. semantic_code_intelligence/cli/commands/investigate_cmd.py +163 -0
  44. semantic_code_intelligence/cli/commands/languages_cmd.py +101 -0
  45. semantic_code_intelligence/cli/commands/lsp_cmd.py +49 -0
  46. semantic_code_intelligence/cli/commands/mcp_cmd.py +50 -0
  47. semantic_code_intelligence/cli/commands/metrics_cmd.py +264 -0
  48. semantic_code_intelligence/cli/commands/models_cmd.py +157 -0
  49. semantic_code_intelligence/cli/commands/plugin_cmd.py +275 -0
  50. semantic_code_intelligence/cli/commands/pr_summary_cmd.py +178 -0
  51. semantic_code_intelligence/cli/commands/quality_cmd.py +208 -0
  52. semantic_code_intelligence/cli/commands/refactor_cmd.py +103 -0
  53. semantic_code_intelligence/cli/commands/review_cmd.py +88 -0
  54. semantic_code_intelligence/cli/commands/search_cmd.py +236 -0
  55. semantic_code_intelligence/cli/commands/serve_cmd.py +117 -0
  56. semantic_code_intelligence/cli/commands/suggest_cmd.py +100 -0
  57. semantic_code_intelligence/cli/commands/summary_cmd.py +78 -0
  58. semantic_code_intelligence/cli/commands/tool_cmd.py +282 -0
  59. semantic_code_intelligence/cli/commands/trace_cmd.py +123 -0
  60. semantic_code_intelligence/cli/commands/tui_cmd.py +58 -0
  61. semantic_code_intelligence/cli/commands/viz_cmd.py +127 -0
  62. semantic_code_intelligence/cli/commands/watch_cmd.py +72 -0
  63. semantic_code_intelligence/cli/commands/web_cmd.py +61 -0
  64. semantic_code_intelligence/cli/commands/workspace_cmd.py +250 -0
  65. semantic_code_intelligence/cli/main.py +65 -0
  66. semantic_code_intelligence/cli/router.py +92 -0
  67. semantic_code_intelligence/config/__init__.py +0 -0
  68. semantic_code_intelligence/config/settings.py +260 -0
  69. semantic_code_intelligence/context/__init__.py +19 -0
  70. semantic_code_intelligence/context/engine.py +429 -0
  71. semantic_code_intelligence/context/memory.py +253 -0
  72. semantic_code_intelligence/daemon/__init__.py +1 -0
  73. semantic_code_intelligence/daemon/watcher.py +515 -0
  74. semantic_code_intelligence/docs/__init__.py +1080 -0
  75. semantic_code_intelligence/embeddings/__init__.py +0 -0
  76. semantic_code_intelligence/embeddings/enhanced.py +131 -0
  77. semantic_code_intelligence/embeddings/generator.py +149 -0
  78. semantic_code_intelligence/embeddings/model_registry.py +100 -0
  79. semantic_code_intelligence/evolution/__init__.py +1 -0
  80. semantic_code_intelligence/evolution/budget_guard.py +111 -0
  81. semantic_code_intelligence/evolution/commit_manager.py +88 -0
  82. semantic_code_intelligence/evolution/context_builder.py +131 -0
  83. semantic_code_intelligence/evolution/engine.py +249 -0
  84. semantic_code_intelligence/evolution/patch_generator.py +229 -0
  85. semantic_code_intelligence/evolution/task_selector.py +214 -0
  86. semantic_code_intelligence/evolution/test_runner.py +111 -0
  87. semantic_code_intelligence/indexing/__init__.py +0 -0
  88. semantic_code_intelligence/indexing/chunker.py +174 -0
  89. semantic_code_intelligence/indexing/parallel.py +86 -0
  90. semantic_code_intelligence/indexing/scanner.py +146 -0
  91. semantic_code_intelligence/indexing/semantic_chunker.py +337 -0
  92. semantic_code_intelligence/llm/__init__.py +62 -0
  93. semantic_code_intelligence/llm/cache.py +219 -0
  94. semantic_code_intelligence/llm/cached_provider.py +145 -0
  95. semantic_code_intelligence/llm/conversation.py +190 -0
  96. semantic_code_intelligence/llm/cross_refactor.py +272 -0
  97. semantic_code_intelligence/llm/investigation.py +274 -0
  98. semantic_code_intelligence/llm/mock_provider.py +77 -0
  99. semantic_code_intelligence/llm/ollama_provider.py +122 -0
  100. semantic_code_intelligence/llm/openai_provider.py +100 -0
  101. semantic_code_intelligence/llm/provider.py +92 -0
  102. semantic_code_intelligence/llm/rate_limiter.py +164 -0
  103. semantic_code_intelligence/llm/reasoning.py +438 -0
  104. semantic_code_intelligence/llm/safety.py +110 -0
  105. semantic_code_intelligence/llm/streaming.py +251 -0
  106. semantic_code_intelligence/lsp/__init__.py +609 -0
  107. semantic_code_intelligence/mcp/__init__.py +393 -0
  108. semantic_code_intelligence/parsing/__init__.py +19 -0
  109. semantic_code_intelligence/parsing/parser.py +375 -0
  110. semantic_code_intelligence/plugins/__init__.py +255 -0
  111. semantic_code_intelligence/plugins/examples/__init__.py +1 -0
  112. semantic_code_intelligence/plugins/examples/code_quality.py +73 -0
  113. semantic_code_intelligence/plugins/examples/search_annotator.py +56 -0
  114. semantic_code_intelligence/scalability/__init__.py +205 -0
  115. semantic_code_intelligence/search/__init__.py +0 -0
  116. semantic_code_intelligence/search/formatter.py +123 -0
  117. semantic_code_intelligence/search/grep.py +361 -0
  118. semantic_code_intelligence/search/hybrid_search.py +170 -0
  119. semantic_code_intelligence/search/keyword_search.py +311 -0
  120. semantic_code_intelligence/search/section_expander.py +103 -0
  121. semantic_code_intelligence/services/__init__.py +0 -0
  122. semantic_code_intelligence/services/indexing_service.py +630 -0
  123. semantic_code_intelligence/services/search_service.py +269 -0
  124. semantic_code_intelligence/storage/__init__.py +0 -0
  125. semantic_code_intelligence/storage/chunk_hash_store.py +86 -0
  126. semantic_code_intelligence/storage/hash_store.py +66 -0
  127. semantic_code_intelligence/storage/index_manifest.py +85 -0
  128. semantic_code_intelligence/storage/index_stats.py +138 -0
  129. semantic_code_intelligence/storage/query_history.py +160 -0
  130. semantic_code_intelligence/storage/symbol_registry.py +209 -0
  131. semantic_code_intelligence/storage/vector_store.py +297 -0
  132. semantic_code_intelligence/tests/__init__.py +0 -0
  133. semantic_code_intelligence/tests/test_ai_features.py +351 -0
  134. semantic_code_intelligence/tests/test_chunker.py +119 -0
  135. semantic_code_intelligence/tests/test_cli.py +188 -0
  136. semantic_code_intelligence/tests/test_config.py +154 -0
  137. semantic_code_intelligence/tests/test_context.py +381 -0
  138. semantic_code_intelligence/tests/test_embeddings.py +73 -0
  139. semantic_code_intelligence/tests/test_endtoend.py +1142 -0
  140. semantic_code_intelligence/tests/test_enhanced_embeddings.py +92 -0
  141. semantic_code_intelligence/tests/test_hash_store.py +79 -0
  142. semantic_code_intelligence/tests/test_logging.py +55 -0
  143. semantic_code_intelligence/tests/test_new_cli.py +138 -0
  144. semantic_code_intelligence/tests/test_parser.py +495 -0
  145. semantic_code_intelligence/tests/test_phase10.py +355 -0
  146. semantic_code_intelligence/tests/test_phase11.py +593 -0
  147. semantic_code_intelligence/tests/test_phase12.py +375 -0
  148. semantic_code_intelligence/tests/test_phase13.py +663 -0
  149. semantic_code_intelligence/tests/test_phase14.py +568 -0
  150. semantic_code_intelligence/tests/test_phase15.py +814 -0
  151. semantic_code_intelligence/tests/test_phase16.py +792 -0
  152. semantic_code_intelligence/tests/test_phase17.py +815 -0
  153. semantic_code_intelligence/tests/test_phase18.py +934 -0
  154. semantic_code_intelligence/tests/test_phase19.py +986 -0
  155. semantic_code_intelligence/tests/test_phase20.py +2753 -0
  156. semantic_code_intelligence/tests/test_phase20b.py +2058 -0
  157. semantic_code_intelligence/tests/test_phase20c.py +962 -0
  158. semantic_code_intelligence/tests/test_phase21.py +428 -0
  159. semantic_code_intelligence/tests/test_phase22.py +799 -0
  160. semantic_code_intelligence/tests/test_phase23.py +783 -0
  161. semantic_code_intelligence/tests/test_phase24.py +715 -0
  162. semantic_code_intelligence/tests/test_phase25.py +496 -0
  163. semantic_code_intelligence/tests/test_phase26.py +251 -0
  164. semantic_code_intelligence/tests/test_phase27.py +531 -0
  165. semantic_code_intelligence/tests/test_phase8.py +592 -0
  166. semantic_code_intelligence/tests/test_phase9.py +643 -0
  167. semantic_code_intelligence/tests/test_plugins.py +293 -0
  168. semantic_code_intelligence/tests/test_priority_features.py +727 -0
  169. semantic_code_intelligence/tests/test_router.py +41 -0
  170. semantic_code_intelligence/tests/test_scalability.py +138 -0
  171. semantic_code_intelligence/tests/test_scanner.py +125 -0
  172. semantic_code_intelligence/tests/test_search.py +160 -0
  173. semantic_code_intelligence/tests/test_semantic_chunker.py +255 -0
  174. semantic_code_intelligence/tests/test_tools.py +182 -0
  175. semantic_code_intelligence/tests/test_vector_store.py +151 -0
  176. semantic_code_intelligence/tests/test_watcher.py +211 -0
  177. semantic_code_intelligence/tools/__init__.py +442 -0
  178. semantic_code_intelligence/tools/executor.py +232 -0
  179. semantic_code_intelligence/tools/protocol.py +200 -0
  180. semantic_code_intelligence/tui/__init__.py +454 -0
  181. semantic_code_intelligence/utils/__init__.py +0 -0
  182. semantic_code_intelligence/utils/logging.py +112 -0
  183. semantic_code_intelligence/version.py +3 -0
  184. semantic_code_intelligence/web/__init__.py +11 -0
  185. semantic_code_intelligence/web/api.py +289 -0
  186. semantic_code_intelligence/web/server.py +397 -0
  187. semantic_code_intelligence/web/ui.py +659 -0
  188. semantic_code_intelligence/web/visualize.py +226 -0
  189. semantic_code_intelligence/workspace/__init__.py +427 -0
@@ -0,0 +1,146 @@
1
+ """Repository scanner — walks the file tree and filters indexable files.
2
+
3
+ Respects ``.gitignore`` and ``.codexaignore`` patterns for fine-grained
4
+ exclusion control.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import fnmatch
10
+ import hashlib
11
+ from dataclasses import dataclass, field
12
+ from pathlib import Path
13
+
14
+ from semantic_code_intelligence.config.settings import AppConfig, IndexConfig
15
+
16
+
17
@dataclass
class ScannedFile:
    """Represents a single file discovered during scanning.

    Attributes:
        path: Location of the file on disk.
        relative_path: ``path`` expressed relative to the scanned root.
        extension: File suffix as reported by ``Path.suffix``.
        size_bytes: Size of the file in bytes.
        content_hash: Digest of the file contents, used for change detection.
    """

    path: Path
    relative_path: str
    extension: str
    size_bytes: int
    content_hash: str
26
+
27
+
28
def compute_file_hash(file_path: Path) -> str:
    """Return the SHA-256 digest of a file, used for change detection.

    The file is read in fixed-size blocks so that large files are never
    loaded into memory in one piece.

    Args:
        file_path: Path to the file.

    Returns:
        Hex-encoded SHA-256 digest of the file's bytes.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(8192)
            if not block:  # empty read means end of file
                break
            digest.update(block)
    return digest.hexdigest()
42
+
43
+
44
def _load_ignore_patterns(root: Path) -> list[str]:
    """Read glob patterns from ``<root>/.codexaignore``.

    Lines are stripped of surrounding whitespace; blank lines and lines
    starting with ``#`` are dropped. Every remaining line is returned as
    one pattern, in file order.

    Args:
        root: Directory expected to contain the ``.codexaignore`` file.

    Returns:
        The list of patterns, or an empty list when the file is absent.
    """
    codexaignore = root / ".codexaignore"
    if not codexaignore.exists():
        return []

    raw_text = codexaignore.read_text(encoding="utf-8", errors="replace")
    entries = (entry.strip() for entry in raw_text.splitlines())
    return [entry for entry in entries if entry and not entry.startswith("#")]
59
+
60
+
61
def _matches_ignore_patterns(relative_path: str, patterns: list[str]) -> bool:
    """Check whether a relative path matches any .codexaignore pattern.

    A pattern matches when any of the following holds:

    * it matches the whole path (as given, or with ``\\`` turned into ``/``);
    * it contains no ``/`` and matches any single path component, so
      ``node_modules`` excludes everything below such a directory;
    * it contains a ``/`` (or starts with one) and matches a leading
      directory prefix of the path, so ``docs/api`` excludes
      ``docs/api/index.md``;
    * it ends with ``/``, in which case only directory components and
      prefixes are considered, never the file name itself.

    Args:
        relative_path: Path relative to the scan root; either separator
            style is accepted.
        patterns: Glob patterns as loaded from ``.codexaignore``.

    Returns:
        True if at least one pattern excludes the path.
    """
    normalized = relative_path.replace("\\", "/")
    parts = [part for part in normalized.split("/") if part]
    # Leading directory prefixes: "a", "a/b", ... (the file name is excluded).
    dir_prefixes = ["/".join(parts[:end]) for end in range(1, len(parts))]

    for pattern in patterns:
        # Whole-path match, kept first so every previously matching pattern
        # still matches.
        if fnmatch.fnmatch(relative_path, pattern) or fnmatch.fnmatch(normalized, pattern):
            return True

        core = pattern.strip("/")
        if not core:
            continue
        dir_only = pattern.endswith("/")

        if "/" in core or pattern.startswith("/"):
            # Anchored / multi-segment pattern: compare against path prefixes.
            candidates = dir_prefixes if dir_only else [normalized, *dir_prefixes]
        else:
            # Bare name: compare against individual components.
            candidates = parts[:-1] if dir_only else parts

        if any(fnmatch.fnmatch(candidate, core) for candidate in candidates):
            return True

    return False
70
+
71
+
72
def should_ignore(path: Path, root: Path, ignore_dirs: set[str]) -> bool:
    """Decide whether a path must be skipped because of an ignored name.

    Every component of ``path`` relative to ``root`` is compared against
    ``ignore_dirs``. Paths that do not live under ``root`` are always
    skipped.

    Args:
        path: The file or directory path to check.
        root: The project root path.
        ignore_dirs: Set of directory names to ignore.

    Returns:
        True if the path should be skipped.
    """
    try:
        relative = path.relative_to(root)
    except ValueError:
        # Not located under the root at all.
        return True

    for component in relative.parts:
        if component in ignore_dirs:
            return True
    return False
88
+
89
+
90
def scan_repository(
    root: Path,
    index_config: IndexConfig | None = None,
) -> list[ScannedFile]:
    """Walk a repository and collect every file that should be indexed.

    A file is kept when it is a regular file, its suffix is listed in
    ``index_config.extensions``, none of its path components is named in
    ``index_config.ignore_dirs``, and it matches no pattern from the
    ``.codexaignore`` file at the project root. Files that cannot be
    stat'ed or read are silently skipped.

    Args:
        root: Root directory to scan.
        index_config: Indexing configuration. Uses defaults if None.

    Returns:
        ScannedFile objects for all matching files, in sorted path order.
    """
    config = IndexConfig() if index_config is None else index_config

    root = root.resolve()
    ignore_patterns = _load_ignore_patterns(root)
    found: list[ScannedFile] = []

    for candidate in sorted(root.rglob("*")):
        wanted = candidate.is_file() and candidate.suffix in config.extensions
        if not wanted or should_ignore(candidate, root, config.ignore_dirs):
            continue

        try:
            native_rel = str(candidate.relative_to(root))
        except ValueError:
            continue

        # .codexaignore patterns are matched against a forward-slash path.
        if ignore_patterns and _matches_ignore_patterns(
            native_rel.replace("\\", "/"), ignore_patterns
        ):
            continue

        try:
            found.append(
                ScannedFile(
                    path=candidate,
                    relative_path=native_rel,
                    extension=candidate.suffix,
                    size_bytes=candidate.stat().st_size,
                    content_hash=compute_file_hash(candidate),
                )
            )
        except OSError:
            # Unreadable or vanished file (PermissionError is an OSError).
            continue

    return found
@@ -0,0 +1,337 @@
1
+ """AST-aware semantic chunker — splits code along structural boundaries.
2
+
3
+ Uses tree-sitter parsed symbols to produce chunks aligned to function,
4
+ class, and method boundaries rather than arbitrary line counts. Falls
5
+ back to the line-based chunker for unsupported languages.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ from semantic_code_intelligence.indexing.chunker import (
15
+ CodeChunk,
16
+ chunk_code,
17
+ detect_language,
18
+ )
19
+ from semantic_code_intelligence.parsing.parser import (
20
+ Symbol,
21
+ parse_file,
22
+ detect_language as detect_ts_language,
23
+ )
24
+ from semantic_code_intelligence.utils.logging import get_logger
25
+
26
+ logger = get_logger("indexing.semantic_chunker")
27
+
28
+
29
@dataclass
class SemanticChunk(CodeChunk):
    """A code chunk enriched with information about the symbol it covers.

    The semantic fields all default to empty values, so a chunk can be
    created with only the base ``CodeChunk`` fields.
    """

    symbol_name: str = ""
    symbol_kind: str = ""  # "function", "class", "method", "module_header", "block"
    parent_symbol: str = ""
    parameters: list[str] = field(default_factory=list)
    semantic_label: str = ""  # human-readable label for the chunk

    def to_dict(self) -> dict[str, Any]:
        """Return the chunk as a plain dictionary keyed by field name."""
        exported = (
            "file_path",
            "content",
            "start_line",
            "end_line",
            "chunk_index",
            "language",
            "symbol_name",
            "symbol_kind",
            "parent_symbol",
            "parameters",
            "semantic_label",
        )
        return {key: getattr(self, key) for key in exported}
53
+
54
+
55
def _build_semantic_label(chunk: "SemanticChunk") -> str:
    """Compose the human-readable label that is prepended for embedding.

    The label is the space-separated sequence of, when available: the
    language in brackets (omitted for ``"unknown"``), the symbol kind, the
    symbol name (qualified with its parent when there is one) and the
    parenthesised parameter list.
    """
    has_language = bool(chunk.language) and chunk.language != "unknown"

    if chunk.parent_symbol:
        qualified_name = f"{chunk.parent_symbol}.{chunk.symbol_name}"
    else:
        qualified_name = chunk.symbol_name

    pieces = [
        f"[{chunk.language}]" if has_language else "",
        chunk.symbol_kind,
        qualified_name,
        f"({', '.join(chunk.parameters)})" if chunk.parameters else "",
    ]
    # Empty pieces stand for absent information and are left out.
    return " ".join(piece for piece in pieces if piece)
69
+
70
+
71
def _split_at_line_boundaries(
    text: str,
    first_line: int,
    last_line: int,
    max_chunk_size: int,
) -> list[tuple[int, int, str]]:
    """Cut *text* into pieces of about ``max_chunk_size`` characters.

    Text that already fits is returned as a single piece. Otherwise lines
    are accumulated until the running character count reaches
    ``max_chunk_size``, at which point a piece is closed; a piece may thus
    exceed the limit by part of its last line. A trailing remainder is
    kept only when it contains non-whitespace characters.

    Args:
        text: The text to split.
        first_line: 1-indexed file line on which *text* starts.
        last_line: 1-indexed file line on which *text* ends.
        max_chunk_size: Character budget per piece.

    Returns:
        ``(start_line, end_line, content)`` tuples in source order.
    """
    if len(text) <= max_chunk_size:
        return [(first_line, last_line, text)]

    pieces: list[tuple[int, int, str]] = []
    pending: list[str] = []
    pending_start = first_line
    pending_chars = 0

    for offset, line in enumerate(text.splitlines(keepends=True)):
        pending.append(line)
        pending_chars += len(line)
        if pending_chars >= max_chunk_size:
            pieces.append((pending_start, first_line + offset, "".join(pending)))
            pending = []
            pending_start = first_line + offset + 1
            pending_chars = 0

    tail = "".join(pending)
    if tail.strip():
        pieces.append((pending_start, last_line, tail))
    return pieces


def _symbols_to_chunks(
    symbols: list[Symbol],
    content: str,
    file_path: str,
    language: str,
    max_chunk_size: int = 512,
) -> list[SemanticChunk]:
    """Convert parsed symbols into semantic chunks.

    Each non-import symbol with a non-blank body becomes one chunk, or
    several when the body exceeds ``max_chunk_size``; every piece keeps the
    symbol's name, kind, parent and parameters. Lines not covered by any
    such symbol are emitted as ``"block"`` chunks, except that an unsplit
    region starting within the first five lines is tagged
    ``"module_header"``.

    Args:
        symbols: Symbols extracted from ``content``.
        content: Full source text of the file.
        file_path: Path recorded on every chunk.
        language: Language name recorded on every chunk.
        max_chunk_size: Character budget per chunk.

    Returns:
        Chunks ordered by start line, with ``chunk_index`` renumbered to
        match that order.
    """
    chunks: list[SemanticChunk] = []

    def emit(
        text: str,
        first_line: int,
        last_line: int,
        name: str,
        kind: str,
        parent: str,
        parameters: list[str],
    ) -> None:
        """Build one labelled chunk and append it to ``chunks``."""
        sc = SemanticChunk(
            file_path=file_path,
            content=text,
            start_line=first_line,
            end_line=last_line,
            chunk_index=len(chunks),
            language=language,
            symbol_name=name,
            symbol_kind=kind,
            parent_symbol=parent,
            parameters=parameters,
        )
        sc.semantic_label = _build_semantic_label(sc)
        chunks.append(sc)

    lines = content.splitlines(keepends=True)
    covered_lines: set[int] = set()  # 1-indexed lines covered by symbols

    # Sort symbols by start_line (longest first on ties) for deterministic output
    sorted_symbols = sorted(symbols, key=lambda s: (s.start_line, -s.end_line))

    for sym in sorted_symbols:
        if sym.kind == "import":
            continue  # not marked as covered, so imports land in uncovered blocks

        body = sym.body
        if not body.strip():
            continue

        covered_lines.update(range(sym.start_line, sym.end_line + 1))

        for first, last, text in _split_at_line_boundaries(
            body, sym.start_line, sym.end_line, max_chunk_size
        ):
            # A fresh parameter list per chunk so chunks never share one list.
            emit(text, first, last, sym.name, sym.kind, sym.parent or "", list(sym.parameters))

    # Collect uncovered regions (module-level code, imports header, etc.)
    for start_line, end_line, block_content in _extract_uncovered_blocks(lines, covered_lines):
        if not block_content.strip():
            continue

        fits = len(block_content) <= max_chunk_size
        # Only an unsplit region near the top of the file is a module header.
        kind = "module_header" if fits and start_line <= 5 else "block"

        for first, last, text in _split_at_line_boundaries(
            block_content, start_line, end_line, max_chunk_size
        ):
            emit(text, first, last, "", kind, "", [])

    # Sort by start_line for stable ordering, then renumber
    chunks.sort(key=lambda c: c.start_line)
    for i, c in enumerate(chunks):
        c.chunk_index = i

    return chunks
234
+
235
+
236
def _extract_uncovered_blocks(
    lines: list[str],
    covered_lines: set[int],
) -> list[tuple[int, int, str]]:
    """Group the lines that no symbol covers into contiguous runs.

    Args:
        lines: File content split into lines (line endings kept).
        covered_lines: 1-indexed numbers of the lines covered by symbols.

    Returns:
        ``(start_line, end_line, content)`` tuples (1-indexed, inclusive),
        one per maximal run of uncovered lines, in file order.
    """
    blocks: list[tuple[int, int, str]] = []
    pending: list[str] = []  # lines of the run currently being collected

    for line_num, text in enumerate(lines, start=1):
        if line_num not in covered_lines:
            pending.append(text)
            continue
        if pending:
            # A covered line closes the run that ended on the previous line.
            blocks.append((line_num - len(pending), line_num - 1, "".join(pending)))
            pending = []

    if pending:
        total = len(lines)
        blocks.append((total - len(pending) + 1, total, "".join(pending)))

    return blocks
264
+
265
+
266
def semantic_chunk_code(
    content: str,
    file_path: str,
    chunk_size: int = 512,
    chunk_overlap: int = 64,
) -> list[SemanticChunk]:
    """Split source code into chunks that follow its structure.

    When the tree-sitter layer recognises the file's language and parsing
    yields at least one symbol, chunks are aligned to those symbols.
    Otherwise the line-based chunker is used and each of its chunks is
    wrapped as a SemanticChunk of kind ``"block"``.

    Args:
        content: Full source code string.
        file_path: Path for language detection and metadata.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Overlap chars (used only in fallback mode).

    Returns:
        List of SemanticChunk objects; empty for blank content.
    """
    if not content.strip():
        return []

    language = detect_language(file_path)

    # AST-aware path: only when tree-sitter knows the language and finds symbols.
    if detect_ts_language(file_path) is not None:
        symbols = parse_file(file_path, content)
        if symbols:
            return _symbols_to_chunks(symbols, content, file_path, language, chunk_size)

    # Fallback: wrap line-based chunks as SemanticChunks.
    wrapped: list[SemanticChunk] = []
    for plain in chunk_code(content, file_path, chunk_size, chunk_overlap):
        wrapped.append(
            SemanticChunk(
                file_path=plain.file_path,
                content=plain.content,
                start_line=plain.start_line,
                end_line=plain.end_line,
                chunk_index=plain.chunk_index,
                language=plain.language,
                symbol_kind="block",
            )
        )
    return wrapped
316
+
317
+
318
def semantic_chunk_file(
    file_path: Path,
    chunk_size: int = 512,
    chunk_overlap: int = 64,
) -> list[SemanticChunk]:
    """Load a source file from disk and chunk its contents.

    The file is decoded as UTF-8 with undecodable bytes replaced. A file
    that cannot be read produces an empty result instead of an error.

    Args:
        file_path: Path to the source file.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Overlap for fallback mode.

    Returns:
        List of SemanticChunk objects.
    """
    source_path = Path(file_path)
    try:
        text = source_path.read_text(encoding="utf-8", errors="replace")
    except OSError:  # covers PermissionError, which is a subclass
        return []
    return semantic_chunk_code(text, str(file_path), chunk_size, chunk_overlap)
@@ -0,0 +1,62 @@
1
+ """LLM integration layer — provider abstraction, reasoning engine, and safety.
2
+
3
+ Provides:
4
+ - LLMProvider: abstract base class for LLM backends
5
+ - OpenAIProvider: OpenAI API integration
6
+ - OllamaProvider: Ollama local model integration
7
+ - MockProvider: deterministic mock for testing
8
+ - CachedProvider: transparent caching and rate limiting wrapper
9
+ - LLMCache / CacheStats: disk-backed response cache with TTL
10
+ - RateLimiter / RateLimitExceeded: sliding-window rate limiting
11
+ - ReasoningEngine: orchestrates context + LLM for AI-assisted tasks
12
+ - SafetyValidator: validates LLM outputs before applying
13
+ - ConversationSession / SessionStore: multi-turn conversation persistence
14
+ - InvestigationChain: autonomous multi-step code investigation
15
+ - stream_chat / StreamEvent: streaming LLM responses with plugin hooks
16
+ - analyze_cross_repo: cross-repo refactoring suggestions
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ from semantic_code_intelligence.llm.provider import (
22
+ LLMProvider,
23
+ LLMResponse,
24
+ LLMMessage,
25
+ )
26
+ from semantic_code_intelligence.llm.openai_provider import OpenAIProvider
27
+ from semantic_code_intelligence.llm.ollama_provider import OllamaProvider
28
+ from semantic_code_intelligence.llm.mock_provider import MockProvider
29
+ from semantic_code_intelligence.llm.cached_provider import CachedProvider
30
+ from semantic_code_intelligence.llm.cache import LLMCache, CacheStats
31
+ from semantic_code_intelligence.llm.rate_limiter import RateLimiter, RateLimitExceeded, RateLimiterStats
32
+ from semantic_code_intelligence.llm.reasoning import ReasoningEngine
33
+ from semantic_code_intelligence.llm.safety import SafetyValidator
34
+ from semantic_code_intelligence.llm.conversation import ConversationSession, SessionStore
35
+ from semantic_code_intelligence.llm.investigation import InvestigationChain, InvestigationResult
36
+ from semantic_code_intelligence.llm.streaming import stream_chat, StreamEvent
37
+ from semantic_code_intelligence.llm.cross_refactor import analyze_cross_repo, CrossRefactorResult
38
+
39
# Public API of the package; entries are grouped by the submodule they are
# imported from above.
__all__ = [
    # llm.provider
    "LLMProvider",
    "LLMResponse",
    "LLMMessage",
    # llm.openai_provider / llm.ollama_provider / llm.mock_provider
    "OpenAIProvider",
    "OllamaProvider",
    "MockProvider",
    # llm.cached_provider / llm.cache
    "CachedProvider",
    "LLMCache",
    "CacheStats",
    # llm.rate_limiter
    "RateLimiter",
    "RateLimitExceeded",
    "RateLimiterStats",
    # llm.reasoning / llm.safety
    "ReasoningEngine",
    "SafetyValidator",
    # llm.conversation
    "ConversationSession",
    "SessionStore",
    # llm.investigation
    "InvestigationChain",
    "InvestigationResult",
    # llm.streaming
    "stream_chat",
    "StreamEvent",
    # llm.cross_refactor
    "analyze_cross_repo",
    "CrossRefactorResult",
]