codexa 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codexa-0.4.0.dist-info/METADATA +650 -0
- codexa-0.4.0.dist-info/RECORD +189 -0
- codexa-0.4.0.dist-info/WHEEL +5 -0
- codexa-0.4.0.dist-info/entry_points.txt +2 -0
- codexa-0.4.0.dist-info/licenses/LICENSE +21 -0
- codexa-0.4.0.dist-info/top_level.txt +1 -0
- semantic_code_intelligence/__init__.py +5 -0
- semantic_code_intelligence/analysis/__init__.py +21 -0
- semantic_code_intelligence/analysis/ai_features.py +351 -0
- semantic_code_intelligence/bridge/__init__.py +28 -0
- semantic_code_intelligence/bridge/context_provider.py +245 -0
- semantic_code_intelligence/bridge/protocol.py +167 -0
- semantic_code_intelligence/bridge/server.py +348 -0
- semantic_code_intelligence/bridge/vscode.py +271 -0
- semantic_code_intelligence/ci/__init__.py +13 -0
- semantic_code_intelligence/ci/hooks.py +98 -0
- semantic_code_intelligence/ci/hotspots.py +272 -0
- semantic_code_intelligence/ci/impact.py +246 -0
- semantic_code_intelligence/ci/metrics.py +591 -0
- semantic_code_intelligence/ci/pr.py +412 -0
- semantic_code_intelligence/ci/quality.py +557 -0
- semantic_code_intelligence/ci/templates.py +164 -0
- semantic_code_intelligence/ci/trace.py +224 -0
- semantic_code_intelligence/cli/__init__.py +0 -0
- semantic_code_intelligence/cli/commands/__init__.py +0 -0
- semantic_code_intelligence/cli/commands/ask_cmd.py +153 -0
- semantic_code_intelligence/cli/commands/benchmark_cmd.py +303 -0
- semantic_code_intelligence/cli/commands/chat_cmd.py +252 -0
- semantic_code_intelligence/cli/commands/ci_gen_cmd.py +74 -0
- semantic_code_intelligence/cli/commands/context_cmd.py +120 -0
- semantic_code_intelligence/cli/commands/cross_refactor_cmd.py +113 -0
- semantic_code_intelligence/cli/commands/deps_cmd.py +91 -0
- semantic_code_intelligence/cli/commands/docs_cmd.py +101 -0
- semantic_code_intelligence/cli/commands/doctor_cmd.py +147 -0
- semantic_code_intelligence/cli/commands/evolve_cmd.py +171 -0
- semantic_code_intelligence/cli/commands/explain_cmd.py +112 -0
- semantic_code_intelligence/cli/commands/gate_cmd.py +135 -0
- semantic_code_intelligence/cli/commands/grep_cmd.py +234 -0
- semantic_code_intelligence/cli/commands/hotspots_cmd.py +119 -0
- semantic_code_intelligence/cli/commands/impact_cmd.py +131 -0
- semantic_code_intelligence/cli/commands/index_cmd.py +138 -0
- semantic_code_intelligence/cli/commands/init_cmd.py +152 -0
- semantic_code_intelligence/cli/commands/investigate_cmd.py +163 -0
- semantic_code_intelligence/cli/commands/languages_cmd.py +101 -0
- semantic_code_intelligence/cli/commands/lsp_cmd.py +49 -0
- semantic_code_intelligence/cli/commands/mcp_cmd.py +50 -0
- semantic_code_intelligence/cli/commands/metrics_cmd.py +264 -0
- semantic_code_intelligence/cli/commands/models_cmd.py +157 -0
- semantic_code_intelligence/cli/commands/plugin_cmd.py +275 -0
- semantic_code_intelligence/cli/commands/pr_summary_cmd.py +178 -0
- semantic_code_intelligence/cli/commands/quality_cmd.py +208 -0
- semantic_code_intelligence/cli/commands/refactor_cmd.py +103 -0
- semantic_code_intelligence/cli/commands/review_cmd.py +88 -0
- semantic_code_intelligence/cli/commands/search_cmd.py +236 -0
- semantic_code_intelligence/cli/commands/serve_cmd.py +117 -0
- semantic_code_intelligence/cli/commands/suggest_cmd.py +100 -0
- semantic_code_intelligence/cli/commands/summary_cmd.py +78 -0
- semantic_code_intelligence/cli/commands/tool_cmd.py +282 -0
- semantic_code_intelligence/cli/commands/trace_cmd.py +123 -0
- semantic_code_intelligence/cli/commands/tui_cmd.py +58 -0
- semantic_code_intelligence/cli/commands/viz_cmd.py +127 -0
- semantic_code_intelligence/cli/commands/watch_cmd.py +72 -0
- semantic_code_intelligence/cli/commands/web_cmd.py +61 -0
- semantic_code_intelligence/cli/commands/workspace_cmd.py +250 -0
- semantic_code_intelligence/cli/main.py +65 -0
- semantic_code_intelligence/cli/router.py +92 -0
- semantic_code_intelligence/config/__init__.py +0 -0
- semantic_code_intelligence/config/settings.py +260 -0
- semantic_code_intelligence/context/__init__.py +19 -0
- semantic_code_intelligence/context/engine.py +429 -0
- semantic_code_intelligence/context/memory.py +253 -0
- semantic_code_intelligence/daemon/__init__.py +1 -0
- semantic_code_intelligence/daemon/watcher.py +515 -0
- semantic_code_intelligence/docs/__init__.py +1080 -0
- semantic_code_intelligence/embeddings/__init__.py +0 -0
- semantic_code_intelligence/embeddings/enhanced.py +131 -0
- semantic_code_intelligence/embeddings/generator.py +149 -0
- semantic_code_intelligence/embeddings/model_registry.py +100 -0
- semantic_code_intelligence/evolution/__init__.py +1 -0
- semantic_code_intelligence/evolution/budget_guard.py +111 -0
- semantic_code_intelligence/evolution/commit_manager.py +88 -0
- semantic_code_intelligence/evolution/context_builder.py +131 -0
- semantic_code_intelligence/evolution/engine.py +249 -0
- semantic_code_intelligence/evolution/patch_generator.py +229 -0
- semantic_code_intelligence/evolution/task_selector.py +214 -0
- semantic_code_intelligence/evolution/test_runner.py +111 -0
- semantic_code_intelligence/indexing/__init__.py +0 -0
- semantic_code_intelligence/indexing/chunker.py +174 -0
- semantic_code_intelligence/indexing/parallel.py +86 -0
- semantic_code_intelligence/indexing/scanner.py +146 -0
- semantic_code_intelligence/indexing/semantic_chunker.py +337 -0
- semantic_code_intelligence/llm/__init__.py +62 -0
- semantic_code_intelligence/llm/cache.py +219 -0
- semantic_code_intelligence/llm/cached_provider.py +145 -0
- semantic_code_intelligence/llm/conversation.py +190 -0
- semantic_code_intelligence/llm/cross_refactor.py +272 -0
- semantic_code_intelligence/llm/investigation.py +274 -0
- semantic_code_intelligence/llm/mock_provider.py +77 -0
- semantic_code_intelligence/llm/ollama_provider.py +122 -0
- semantic_code_intelligence/llm/openai_provider.py +100 -0
- semantic_code_intelligence/llm/provider.py +92 -0
- semantic_code_intelligence/llm/rate_limiter.py +164 -0
- semantic_code_intelligence/llm/reasoning.py +438 -0
- semantic_code_intelligence/llm/safety.py +110 -0
- semantic_code_intelligence/llm/streaming.py +251 -0
- semantic_code_intelligence/lsp/__init__.py +609 -0
- semantic_code_intelligence/mcp/__init__.py +393 -0
- semantic_code_intelligence/parsing/__init__.py +19 -0
- semantic_code_intelligence/parsing/parser.py +375 -0
- semantic_code_intelligence/plugins/__init__.py +255 -0
- semantic_code_intelligence/plugins/examples/__init__.py +1 -0
- semantic_code_intelligence/plugins/examples/code_quality.py +73 -0
- semantic_code_intelligence/plugins/examples/search_annotator.py +56 -0
- semantic_code_intelligence/scalability/__init__.py +205 -0
- semantic_code_intelligence/search/__init__.py +0 -0
- semantic_code_intelligence/search/formatter.py +123 -0
- semantic_code_intelligence/search/grep.py +361 -0
- semantic_code_intelligence/search/hybrid_search.py +170 -0
- semantic_code_intelligence/search/keyword_search.py +311 -0
- semantic_code_intelligence/search/section_expander.py +103 -0
- semantic_code_intelligence/services/__init__.py +0 -0
- semantic_code_intelligence/services/indexing_service.py +630 -0
- semantic_code_intelligence/services/search_service.py +269 -0
- semantic_code_intelligence/storage/__init__.py +0 -0
- semantic_code_intelligence/storage/chunk_hash_store.py +86 -0
- semantic_code_intelligence/storage/hash_store.py +66 -0
- semantic_code_intelligence/storage/index_manifest.py +85 -0
- semantic_code_intelligence/storage/index_stats.py +138 -0
- semantic_code_intelligence/storage/query_history.py +160 -0
- semantic_code_intelligence/storage/symbol_registry.py +209 -0
- semantic_code_intelligence/storage/vector_store.py +297 -0
- semantic_code_intelligence/tests/__init__.py +0 -0
- semantic_code_intelligence/tests/test_ai_features.py +351 -0
- semantic_code_intelligence/tests/test_chunker.py +119 -0
- semantic_code_intelligence/tests/test_cli.py +188 -0
- semantic_code_intelligence/tests/test_config.py +154 -0
- semantic_code_intelligence/tests/test_context.py +381 -0
- semantic_code_intelligence/tests/test_embeddings.py +73 -0
- semantic_code_intelligence/tests/test_endtoend.py +1142 -0
- semantic_code_intelligence/tests/test_enhanced_embeddings.py +92 -0
- semantic_code_intelligence/tests/test_hash_store.py +79 -0
- semantic_code_intelligence/tests/test_logging.py +55 -0
- semantic_code_intelligence/tests/test_new_cli.py +138 -0
- semantic_code_intelligence/tests/test_parser.py +495 -0
- semantic_code_intelligence/tests/test_phase10.py +355 -0
- semantic_code_intelligence/tests/test_phase11.py +593 -0
- semantic_code_intelligence/tests/test_phase12.py +375 -0
- semantic_code_intelligence/tests/test_phase13.py +663 -0
- semantic_code_intelligence/tests/test_phase14.py +568 -0
- semantic_code_intelligence/tests/test_phase15.py +814 -0
- semantic_code_intelligence/tests/test_phase16.py +792 -0
- semantic_code_intelligence/tests/test_phase17.py +815 -0
- semantic_code_intelligence/tests/test_phase18.py +934 -0
- semantic_code_intelligence/tests/test_phase19.py +986 -0
- semantic_code_intelligence/tests/test_phase20.py +2753 -0
- semantic_code_intelligence/tests/test_phase20b.py +2058 -0
- semantic_code_intelligence/tests/test_phase20c.py +962 -0
- semantic_code_intelligence/tests/test_phase21.py +428 -0
- semantic_code_intelligence/tests/test_phase22.py +799 -0
- semantic_code_intelligence/tests/test_phase23.py +783 -0
- semantic_code_intelligence/tests/test_phase24.py +715 -0
- semantic_code_intelligence/tests/test_phase25.py +496 -0
- semantic_code_intelligence/tests/test_phase26.py +251 -0
- semantic_code_intelligence/tests/test_phase27.py +531 -0
- semantic_code_intelligence/tests/test_phase8.py +592 -0
- semantic_code_intelligence/tests/test_phase9.py +643 -0
- semantic_code_intelligence/tests/test_plugins.py +293 -0
- semantic_code_intelligence/tests/test_priority_features.py +727 -0
- semantic_code_intelligence/tests/test_router.py +41 -0
- semantic_code_intelligence/tests/test_scalability.py +138 -0
- semantic_code_intelligence/tests/test_scanner.py +125 -0
- semantic_code_intelligence/tests/test_search.py +160 -0
- semantic_code_intelligence/tests/test_semantic_chunker.py +255 -0
- semantic_code_intelligence/tests/test_tools.py +182 -0
- semantic_code_intelligence/tests/test_vector_store.py +151 -0
- semantic_code_intelligence/tests/test_watcher.py +211 -0
- semantic_code_intelligence/tools/__init__.py +442 -0
- semantic_code_intelligence/tools/executor.py +232 -0
- semantic_code_intelligence/tools/protocol.py +200 -0
- semantic_code_intelligence/tui/__init__.py +454 -0
- semantic_code_intelligence/utils/__init__.py +0 -0
- semantic_code_intelligence/utils/logging.py +112 -0
- semantic_code_intelligence/version.py +3 -0
- semantic_code_intelligence/web/__init__.py +11 -0
- semantic_code_intelligence/web/api.py +289 -0
- semantic_code_intelligence/web/server.py +397 -0
- semantic_code_intelligence/web/ui.py +659 -0
- semantic_code_intelligence/web/visualize.py +226 -0
- semantic_code_intelligence/workspace/__init__.py +427 -0
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""Repository scanner — walks the file tree and filters indexable files.
|
|
2
|
+
|
|
3
|
+
Respects ``.gitignore`` and ``.codexaignore`` patterns for fine-grained
|
|
4
|
+
exclusion control.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import fnmatch
|
|
10
|
+
import hashlib
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
from semantic_code_intelligence.config.settings import AppConfig, IndexConfig
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class ScannedFile:
    """Record describing one indexable file found by the repository scan."""

    # Location of the file as yielded by the directory walk.
    path: Path
    # ``path`` expressed relative to the scanned root, as a string.
    relative_path: str
    # File suffix including the leading dot (``Path.suffix``).
    extension: str
    # Size reported by ``stat()`` in bytes.
    size_bytes: int
    # Hex SHA-256 digest of the file contents, used for change detection.
    content_hash: str
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def compute_file_hash(file_path: Path) -> str:
    """Return the SHA-256 digest of a file, used to detect content changes.

    The file is read in fixed-size binary blocks so that large files are
    never loaded into memory in one piece.

    Args:
        file_path: Path to the file.

    Returns:
        Hex-encoded SHA-256 digest.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(8192)
            if not block:
                # An empty read signals end of file.
                break
            digest.update(block)
    return digest.hexdigest()
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _load_ignore_patterns(root: Path) -> list[str]:
    """Load glob patterns from the ``.codexaignore`` file under ``root``.

    Each non-empty line that does not start with ``#`` is returned, stripped
    of surrounding whitespace, as one glob pattern to be matched against
    relative paths (similar to .gitignore).

    Args:
        root: Directory expected to contain ``.codexaignore``.

    Returns:
        The patterns in file order. An empty list is returned when the file
        is missing or cannot be read (for example it is a directory or the
        process lacks permission), so a broken ignore file never aborts a
        scan.
    """
    ignore_file = root / ".codexaignore"
    try:
        text = ignore_file.read_text(encoding="utf-8", errors="replace")
    except OSError:
        # Covers FileNotFoundError, IsADirectoryError and PermissionError;
        # reading directly also avoids the exists()/read race.
        return []
    stripped_lines = (line.strip() for line in text.splitlines())
    return [line for line in stripped_lines if line and not line.startswith("#")]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _matches_ignore_patterns(relative_path: str, patterns: list[str]) -> bool:
    """Check whether a relative path matches any .codexaignore pattern.

    A pattern matches when any of the following holds:

    * it matches the whole relative path (as given, or with backslashes
      normalized to forward slashes);
    * it contains no ``/`` (ignoring a trailing one) and matches a single
      path component, e.g. ``node_modules`` or ``*.egg-info``; with a
      trailing ``/`` only directory components (everything but the final
      one) are considered, e.g. ``build/``;
    * it contains a ``/`` and, after dropping a leading ``/``, names the
      path itself or a directory the path lives under, e.g. ``docs/api``
      or ``/build``.

    Note that ``fnmatch`` lets ``*`` match ``/`` as well, so ``src/*``
    matches files at any depth below ``src``.

    Args:
        relative_path: Path relative to the scan root.
        patterns: Glob patterns loaded from ``.codexaignore``.

    Returns:
        True if at least one pattern matches.
    """
    normalized = relative_path.replace("\\", "/")
    components = [part for part in normalized.split("/") if part]

    for pattern in patterns:
        # Whole-path match, on both the raw and the slash-normalized form.
        if fnmatch.fnmatch(relative_path, pattern) or fnmatch.fnmatch(normalized, pattern):
            return True

        directory_only = pattern.endswith("/")
        bare = pattern.rstrip("/")
        if not bare:
            continue

        if "/" in bare:
            # Multi-segment or root-anchored pattern: match the path itself
            # or anything located beneath the named directory.
            anchored = bare.lstrip("/")
            if fnmatch.fnmatch(normalized, anchored + "/*"):
                return True
            if not directory_only and fnmatch.fnmatch(normalized, anchored):
                return True
            continue

        # Single-segment pattern: compare against each path component. A
        # trailing slash restricts the match to directories, i.e. every
        # component except the final file name.
        candidates = components[:-1] if directory_only else components
        if any(fnmatch.fnmatch(part, bare) for part in candidates):
            return True

    return False
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def should_ignore(path: Path, root: Path, ignore_dirs: set[str]) -> bool:
    """Decide whether ``path`` must be skipped because of its directory names.

    Args:
        path: The file or directory path to check.
        root: The project root path.
        ignore_dirs: Set of directory names to ignore.

    Returns:
        True if ``path`` is not located under ``root`` or if any component
        of its root-relative path is listed in ``ignore_dirs``.
    """
    try:
        relative = path.relative_to(root)
    except ValueError:
        # Paths outside the root are never indexed.
        return True

    for component in relative.parts:
        if component in ignore_dirs:
            return True
    return False
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def scan_repository(
    root: Path,
    index_config: IndexConfig | None = None,
) -> list[ScannedFile]:
    """Walk ``root`` recursively and collect every file eligible for indexing.

    A file is kept when its suffix is listed in ``index_config.extensions``,
    none of its path components is in ``index_config.ignore_dirs`` and it
    matches no pattern from ``.codexaignore`` at the project root. Files
    that cannot be stat'ed or read are skipped silently.

    Args:
        root: Root directory to scan.
        index_config: Indexing configuration. Uses defaults if None.

    Returns:
        List of ScannedFile objects for all matching files, in sorted path
        order.
    """
    config = IndexConfig() if index_config is None else index_config

    root = root.resolve()
    ignore_patterns = _load_ignore_patterns(root)
    collected: list[ScannedFile] = []

    for candidate in sorted(root.rglob("*")):
        if not candidate.is_file():
            continue
        if candidate.suffix not in config.extensions:
            continue
        if should_ignore(candidate, root, config.ignore_dirs):
            continue

        try:
            relative = candidate.relative_to(root)
        except ValueError:
            continue

        # .codexaignore patterns are matched against a forward-slash path.
        posix_relative = str(relative).replace("\\", "/")
        if ignore_patterns and _matches_ignore_patterns(posix_relative, ignore_patterns):
            continue

        try:
            size = candidate.stat().st_size
            digest = compute_file_hash(candidate)
        except OSError:
            # Unreadable or vanished files are skipped (PermissionError is
            # a subclass of OSError).
            continue

        collected.append(
            ScannedFile(
                path=candidate,
                # Stored with the platform's native separators.
                relative_path=str(relative),
                extension=candidate.suffix,
                size_bytes=size,
                content_hash=digest,
            )
        )

    return collected
|
|
@@ -0,0 +1,337 @@
|
|
|
1
|
+
"""AST-aware semantic chunker — splits code along structural boundaries.
|
|
2
|
+
|
|
3
|
+
Uses tree-sitter parsed symbols to produce chunks aligned to function,
|
|
4
|
+
class, and method boundaries rather than arbitrary line counts. Falls
|
|
5
|
+
back to the line-based chunker for unsupported languages.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
from semantic_code_intelligence.indexing.chunker import (
|
|
15
|
+
CodeChunk,
|
|
16
|
+
chunk_code,
|
|
17
|
+
detect_language,
|
|
18
|
+
)
|
|
19
|
+
from semantic_code_intelligence.parsing.parser import (
|
|
20
|
+
Symbol,
|
|
21
|
+
parse_file,
|
|
22
|
+
detect_language as detect_ts_language,
|
|
23
|
+
)
|
|
24
|
+
from semantic_code_intelligence.utils.logging import get_logger
|
|
25
|
+
|
|
26
|
+
logger = get_logger("indexing.semantic_chunker")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class SemanticChunk(CodeChunk):
    """A code chunk enriched with metadata about the symbol it belongs to."""

    # Name of the symbol the chunk was cut from ("" for non-symbol regions).
    symbol_name: str = ""
    # One of "function", "class", "method", "module_header", "block".
    symbol_kind: str = ""
    # Name of the enclosing symbol, if any.
    parent_symbol: str = ""
    # Parameter names of the symbol.
    parameters: list[str] = field(default_factory=list)
    # Human-readable label for the chunk.
    semantic_label: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the chunk as a plain dictionary keyed by attribute name."""
        exported = (
            "file_path",
            "content",
            "start_line",
            "end_line",
            "chunk_index",
            "language",
            "symbol_name",
            "symbol_kind",
            "parent_symbol",
            "parameters",
            "semantic_label",
        )
        return {name: getattr(self, name) for name in exported}
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _build_semantic_label(chunk: "SemanticChunk") -> str:
    """Compose the human-readable label that is prepended for embedding.

    The label is the space-joined sequence of whichever of these are
    present: ``[language]`` (omitted when empty or ``"unknown"``), the
    symbol kind, the symbol name (qualified as ``parent.name`` when a parent
    is set) and the parenthesised, comma-separated parameter list.
    """
    if chunk.language and chunk.language != "unknown":
        language_tag = f"[{chunk.language}]"
    else:
        language_tag = ""

    if chunk.parent_symbol:
        qualified_name = f"{chunk.parent_symbol}.{chunk.symbol_name}"
    elif chunk.symbol_name:
        qualified_name = chunk.symbol_name
    else:
        qualified_name = ""

    signature = f"({', '.join(chunk.parameters)})" if chunk.parameters else ""

    pieces = (language_tag, chunk.symbol_kind, qualified_name, signature)
    return " ".join(piece for piece in pieces if piece)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _symbols_to_chunks(
    symbols: list[Symbol],
    content: str,
    file_path: str,
    language: str,
    max_chunk_size: int = 512,
) -> list[SemanticChunk]:
    """Convert parsed symbols into semantic chunks.

    Every non-import symbol with a non-blank body yields one chunk, or
    several when the body is longer than ``max_chunk_size`` characters (the
    body is then cut at line boundaries and each piece keeps the symbol's
    metadata). Lines not covered by any such symbol are grouped into
    contiguous regions and emitted as ``"module_header"`` chunks (region
    fits in one chunk and starts within the first five lines) or ``"block"``
    chunks.

    Args:
        symbols: Symbols parsed from ``content``.
        content: Full source text of the file.
        file_path: Path recorded on every chunk.
        language: Language name recorded on every chunk.
        max_chunk_size: Character threshold above which a region is split.

    Returns:
        Chunks ordered by ``start_line`` with ``chunk_index`` renumbered
        from zero in that order.
    """
    chunks: list[SemanticChunk] = []
    lines = content.splitlines(keepends=True)
    covered_lines: set[int] = set()  # 1-indexed lines covered by symbols

    # Sort by start line (outer symbols first on ties) for deterministic output.
    sorted_symbols = sorted(symbols, key=lambda s: (s.start_line, -s.end_line))

    for sym in sorted_symbols:
        if sym.kind == "import":
            continue  # import lines end up in the uncovered regions below

        body = sym.body
        if not body.strip():
            continue

        covered_lines.update(range(sym.start_line, sym.end_line + 1))

        for first, last, text in _split_region(
            body, sym.start_line, sym.end_line, max_chunk_size
        ):
            chunks.append(
                _make_chunk(
                    file_path=file_path,
                    language=language,
                    content=text,
                    start_line=first,
                    end_line=last,
                    chunk_index=len(chunks),
                    symbol_name=sym.name,
                    symbol_kind=sym.kind,
                    parent_symbol=sym.parent or "",
                    # Fresh list per chunk so chunks never share state.
                    parameters=list(sym.parameters),
                )
            )

    # Collect uncovered regions (module-level code, imports header, etc.)
    for start_line, end_line, block_content in _extract_uncovered_blocks(
        lines, covered_lines
    ):
        if not block_content.strip():
            continue

        # Only a region that fits in a single chunk and starts within the
        # first five lines is labelled as the module header; split regions
        # are always plain blocks.
        fits_in_one = len(block_content) <= max_chunk_size
        kind = "module_header" if fits_in_one and start_line <= 5 else "block"

        for first, last, text in _split_region(
            block_content, start_line, end_line, max_chunk_size
        ):
            chunks.append(
                _make_chunk(
                    file_path=file_path,
                    language=language,
                    content=text,
                    start_line=first,
                    end_line=last,
                    chunk_index=len(chunks),
                    symbol_name="",
                    symbol_kind=kind,
                )
            )

    # Stable sort by start line, then renumber to match the final order.
    chunks.sort(key=lambda c: c.start_line)
    for position, chunk in enumerate(chunks):
        chunk.chunk_index = position

    return chunks


def _split_region(
    text: str,
    first_line: int,
    last_line: int,
    max_chunk_size: int,
) -> list[tuple[int, int, str]]:
    """Cut ``text`` into (start_line, end_line, content) pieces by size.

    Text no longer than ``max_chunk_size`` is returned as one piece spanning
    ``first_line``..``last_line``. Longer text is cut at line boundaries: a
    piece is closed as soon as its accumulated length reaches
    ``max_chunk_size``, so the line that crosses the threshold is kept and a
    piece may exceed the limit. A trailing piece that is only whitespace is
    dropped; otherwise it ends at ``last_line``.
    """
    if len(text) <= max_chunk_size:
        return [(first_line, last_line, text)]

    pieces: list[tuple[int, int, str]] = []
    buffer: list[str] = []
    buffer_start = first_line
    buffer_chars = 0

    for offset, line in enumerate(text.splitlines(keepends=True)):
        buffer.append(line)
        buffer_chars += len(line)
        if buffer_chars >= max_chunk_size:
            pieces.append((buffer_start, first_line + offset, "".join(buffer)))
            buffer = []
            buffer_start = first_line + offset + 1
            buffer_chars = 0

    remainder = "".join(buffer)
    if buffer and remainder.strip():
        pieces.append((buffer_start, last_line, remainder))

    return pieces


def _make_chunk(
    *,
    file_path: str,
    language: str,
    content: str,
    start_line: int,
    end_line: int,
    chunk_index: int,
    symbol_name: str,
    symbol_kind: str,
    parent_symbol: str = "",
    parameters: list[str] | None = None,
) -> SemanticChunk:
    """Build one SemanticChunk and fill in its semantic label."""
    chunk = SemanticChunk(
        file_path=file_path,
        content=content,
        start_line=start_line,
        end_line=end_line,
        chunk_index=chunk_index,
        language=language,
        symbol_name=symbol_name,
        symbol_kind=symbol_kind,
        parent_symbol=parent_symbol,
        parameters=[] if parameters is None else parameters,
    )
    chunk.semantic_label = _build_semantic_label(chunk)
    return chunk
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def _extract_uncovered_blocks(
    lines: list[str],
    covered_lines: set[int],
) -> list[tuple[int, int, str]]:
    """Group the lines that no symbol covers into contiguous runs.

    Args:
        lines: Source lines, in file order.
        covered_lines: 1-indexed numbers of the lines already covered.

    Returns:
        One ``(start_line, end_line, content)`` tuple per run (1-indexed,
        inclusive), where ``content`` is the run's lines concatenated.
    """
    runs: list[tuple[int, int, str]] = []
    run_first: int | None = None
    run_text: list[str] = []

    for number, text in enumerate(lines, start=1):
        if number in covered_lines:
            # A covered line terminates the run in progress, if any.
            if run_first is not None:
                runs.append((run_first, number - 1, "".join(run_text)))
                run_first, run_text = None, []
            continue

        if run_first is None:
            run_first = number
        run_text.append(text)

    # Flush a run that extends to the end of the file.
    if run_first is not None:
        runs.append((run_first, len(lines), "".join(run_text)))

    return runs
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def semantic_chunk_code(
    content: str,
    file_path: str,
    chunk_size: int = 512,
    chunk_overlap: int = 64,
) -> list[SemanticChunk]:
    """Split source code into chunks aligned with its symbols where possible.

    When the parser recognises the file's language and returns at least one
    symbol, chunks follow symbol boundaries. Otherwise the line-based
    chunker is used and each of its chunks is re-wrapped as a
    ``SemanticChunk`` of kind ``"block"``.

    Args:
        content: Full source code string.
        file_path: Path for language detection and metadata.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Overlap chars (used only in fallback mode).

    Returns:
        List of SemanticChunk objects; empty when ``content`` is blank.
    """
    if not content.strip():
        return []

    language = detect_language(file_path)
    ts_language = detect_ts_language(file_path)

    # AST-aware path: taken only if the language is supported and the
    # parser actually produced symbols.
    if ts_language is not None:
        symbols = parse_file(file_path, content)
        if symbols:
            return _symbols_to_chunks(symbols, content, file_path, language, chunk_size)

    # Fallback path: re-wrap the line-based chunks.
    wrapped: list[SemanticChunk] = []
    for plain in chunk_code(content, file_path, chunk_size, chunk_overlap):
        wrapped.append(
            SemanticChunk(
                file_path=plain.file_path,
                content=plain.content,
                start_line=plain.start_line,
                end_line=plain.end_line,
                chunk_index=plain.chunk_index,
                language=plain.language,
                symbol_kind="block",
            )
        )
    return wrapped
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def semantic_chunk_file(
    file_path: Path,
    chunk_size: int = 512,
    chunk_overlap: int = 64,
) -> list[SemanticChunk]:
    """Load a source file from disk and chunk its contents semantically.

    The file is decoded as UTF-8 with undecodable bytes replaced.

    Args:
        file_path: Path to the source file.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Overlap for fallback mode.

    Returns:
        List of SemanticChunk objects, or an empty list when the file
        cannot be read.
    """
    source = Path(file_path)
    try:
        text = source.read_text(encoding="utf-8", errors="replace")
    except OSError:
        # PermissionError is a subclass of OSError; unreadable files yield
        # no chunks.
        return []
    return semantic_chunk_code(text, str(file_path), chunk_size, chunk_overlap)
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""LLM integration layer — provider abstraction, reasoning engine, and safety.
|
|
2
|
+
|
|
3
|
+
Provides:
|
|
4
|
+
- LLMProvider: abstract base class for LLM backends
|
|
5
|
+
- OpenAIProvider: OpenAI API integration
|
|
6
|
+
- OllamaProvider: Ollama local model integration
|
|
7
|
+
- MockProvider: deterministic mock for testing
|
|
8
|
+
- CachedProvider: transparent caching and rate limiting wrapper
|
|
9
|
+
- LLMCache / CacheStats: disk-backed response cache with TTL
|
|
10
|
+
- RateLimiter / RateLimitExceeded: sliding-window rate limiting
|
|
11
|
+
- ReasoningEngine: orchestrates context + LLM for AI-assisted tasks
|
|
12
|
+
- SafetyValidator: validates LLM outputs before applying
|
|
13
|
+
- ConversationSession / SessionStore: multi-turn conversation persistence
|
|
14
|
+
- InvestigationChain: autonomous multi-step code investigation
|
|
15
|
+
- stream_chat / StreamEvent: streaming LLM responses with plugin hooks
|
|
16
|
+
- analyze_cross_repo: cross-repo refactoring suggestions
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from semantic_code_intelligence.llm.provider import (
|
|
22
|
+
LLMProvider,
|
|
23
|
+
LLMResponse,
|
|
24
|
+
LLMMessage,
|
|
25
|
+
)
|
|
26
|
+
from semantic_code_intelligence.llm.openai_provider import OpenAIProvider
|
|
27
|
+
from semantic_code_intelligence.llm.ollama_provider import OllamaProvider
|
|
28
|
+
from semantic_code_intelligence.llm.mock_provider import MockProvider
|
|
29
|
+
from semantic_code_intelligence.llm.cached_provider import CachedProvider
|
|
30
|
+
from semantic_code_intelligence.llm.cache import LLMCache, CacheStats
|
|
31
|
+
from semantic_code_intelligence.llm.rate_limiter import RateLimiter, RateLimitExceeded, RateLimiterStats
|
|
32
|
+
from semantic_code_intelligence.llm.reasoning import ReasoningEngine
|
|
33
|
+
from semantic_code_intelligence.llm.safety import SafetyValidator
|
|
34
|
+
from semantic_code_intelligence.llm.conversation import ConversationSession, SessionStore
|
|
35
|
+
from semantic_code_intelligence.llm.investigation import InvestigationChain, InvestigationResult
|
|
36
|
+
from semantic_code_intelligence.llm.streaming import stream_chat, StreamEvent
|
|
37
|
+
from semantic_code_intelligence.llm.cross_refactor import analyze_cross_repo, CrossRefactorResult
|
|
38
|
+
|
|
39
|
+
__all__ = [
|
|
40
|
+
"LLMProvider",
|
|
41
|
+
"LLMResponse",
|
|
42
|
+
"LLMMessage",
|
|
43
|
+
"OpenAIProvider",
|
|
44
|
+
"OllamaProvider",
|
|
45
|
+
"MockProvider",
|
|
46
|
+
"CachedProvider",
|
|
47
|
+
"LLMCache",
|
|
48
|
+
"CacheStats",
|
|
49
|
+
"RateLimiter",
|
|
50
|
+
"RateLimitExceeded",
|
|
51
|
+
"RateLimiterStats",
|
|
52
|
+
"ReasoningEngine",
|
|
53
|
+
"SafetyValidator",
|
|
54
|
+
"ConversationSession",
|
|
55
|
+
"SessionStore",
|
|
56
|
+
"InvestigationChain",
|
|
57
|
+
"InvestigationResult",
|
|
58
|
+
"stream_chat",
|
|
59
|
+
"StreamEvent",
|
|
60
|
+
"analyze_cross_repo",
|
|
61
|
+
"CrossRefactorResult",
|
|
62
|
+
]
|